diff --git a/README.md b/README.md
index 16846c7aa8..7a97d400c6 100644
--- a/README.md
+++ b/README.md
@@ -132,9 +132,7 @@ We *strongly* recommend working with LLM Foundry inside a Docker container (see
 ```bash
 git clone https://github.com/mosaicml/llm-foundry.git
 cd llm-foundry
-pip install -e ".[gpu-flash2]" # or `pip install -e .` if no NVIDIA GPU.
-# Note: Currently, `pip install -e ".[gpu-flash2]"` installs Flash Attention v2, and `pip install -e ".[gpu]"` installs Flash Attention v1.
-# However, once the support for Flash Attention v1 is removed, both of these commands will install Flash Attention v2.
+pip install -e ".[gpu]" # or `pip install -e .` if no NVIDIA GPU.
 ```
 
 ### Without Docker (not recommended)
@@ -152,9 +150,7 @@ source llmfoundry-venv/bin/activate
 
 pip install cmake packaging torch # setup.py requires these be installed
-pip install -e ".[gpu-flash2]" # or `pip install -e .` if no NVIDIA GPU.
-# Note: Currently, `pip install -e ".[gpu-flash2]"` installs Flash Attention v2, and `pip install -e ".[gpu]"` installs Flash Attention v1.
-# However, once the support for Flash Attention v1 is removed, both of these commands will install Flash Attention v2.
+pip install -e ".[gpu]" # or `pip install -e .` if no NVIDIA GPU.
 ```
 
 ### TransformerEngine and amp_fp8 support
diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py
index f65338c987..f4f0d68e9c 100644
--- a/llmfoundry/__init__.py
+++ b/llmfoundry/__init__.py
@@ -19,11 +19,6 @@
 
 hf_dynamic_modules_logger.addFilter(new_files_warning_filter)
 
-# Before importing any transformers models, we need to disable transformers flash attention if
-# we are in an environment with flash attention version <2. Transformers hard errors on a not properly
-# gated import otherwise.
-import transformers
-
 from llmfoundry import optim, utils
 from llmfoundry.data import (ConcatTokensDataset, MixtureOfDenoisersCollator,
                              NoConcatDataset, Seq2SeqFinetuningCollator,
@@ -33,8 +28,7 @@
                                   ComposerHFT5)
 from llmfoundry.models.layers.attention import (
     MultiheadAttention, attn_bias_shape, build_alibi_bias, build_attn_bias,
-    flash_attn_fn, is_flash_v1_installed,
-    scaled_multihead_dot_product_attention, triton_flash_attn_fn)
+    flash_attn_fn, scaled_multihead_dot_product_attention, triton_flash_attn_fn)
 from llmfoundry.models.layers.blocks import MPTBlock
 from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY, MPTMLP, build_ffn
 from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
@@ -42,9 +36,6 @@
                                    MPTForCausalLM, MPTModel, MPTPreTrainedModel)
 from llmfoundry.tokenizers import TiktokenTokenizerWrapper
 
-if is_flash_v1_installed():
-    transformers.utils.is_flash_attn_available = lambda: False
-
 __all__ = [
     'build_text_denoising_dataloader',
     'build_finetuning_dataloader',
@@ -77,4 +68,4 @@
     'TiktokenTokenizerWrapper',
 ]
 
-__version__ = '0.5.0'
+__version__ = '0.6.0'
diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py
index 8fa8e6bc66..79780bccee 100644
--- a/llmfoundry/models/layers/attention.py
+++ b/llmfoundry/models/layers/attention.py
@@ -44,13 +44,6 @@ def check_alibi_support(attention_impl: str) -> bool:
         v2_version='v2.4.2')
 
 
-# Before importing any transformers models, we need to disable transformers flash attention if
-# we are in an environment with flash attention version <2. Transformers hard errors on a not properly
-# gated import otherwise.
-if is_flash_v1_installed():
-    import transformers
-    transformers.utils.is_flash_attn_available = lambda: False
-
 from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
 
 
diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py
index 68fe5befd5..b5a099002e 100644
--- a/llmfoundry/models/mpt/configuration_mpt.py
+++ b/llmfoundry/models/mpt/configuration_mpt.py
@@ -9,7 +9,6 @@
 from transformers import PretrainedConfig
 
 from llmfoundry.models.layers.attention import (check_alibi_support,
-                                                is_flash_v1_installed,
                                                 is_flash_v2_installed)
 from llmfoundry.models.layers.blocks import attn_config_defaults
 
@@ -230,13 +229,6 @@ def _validate_config(self) -> None:
             raise NotImplementedError(
                 'prefix_lm only implemented with torch and triton attention.')
 
-        if self.attn_config['attn_impl'] == 'flash' and is_flash_v1_installed():
-            warnings.warn(
-                VersionedDeprecationWarning(
-                    'Support for Flash Attention v1 is deprecated. Please upgrade to Flash Attention v2.4.2. To install Flash Attention v2.4.2, please run `pip install -e ".[gpu-flash2]"` from the root directory of the llm-foundry repository.',
-                    remove_version='0.6.0',
-                ))
-
         if self.attn_config[
                 'attn_impl'] == 'triton' and not self.attn_config['prefix_lm']:
             warnings.warn(
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index ff33990d7a..0a5e2ac13d 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -27,8 +27,7 @@
 from composer.utils import dist
 
 from llmfoundry.metrics import TokenAccuracy
-from llmfoundry.models.layers.attention import (is_flash_v1_installed,
-                                                is_flash_v2_installed)
+from llmfoundry.models.layers.attention import is_flash_v2_installed
 from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY
 
 if is_flash_v2_installed():
@@ -39,12 +38,6 @@
     except Exception as e:
         raise e
 
-if is_flash_v1_installed():
-    try: # This try...except is needed because transformers requires it despite the 'if' statement above
-        from flash_attn import bert_padding
-    except Exception as e:
-        raise e
-
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
 from transformers import PreTrainedModel, PreTrainedTokenizerBase
diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml
index 36de709aed..091fc5a84e 100644
--- a/mcli/mcli-llama2-finetune.yaml
+++ b/mcli/mcli-llama2-finetune.yaml
@@ -38,7 +38,7 @@ parameters:
     pretrained: true
     # Note: you must have set the HUGGING_FACE_HUB_TOKEN environment variable and have access to the llama2 models
     use_auth_token: true
-    attention_patch_type: triton
+    use_flash_attention_2: true
 
   # Tokenizer
   tokenizer:
@@ -62,7 +62,7 @@
       # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
       # # to profile this run's optimal packing_ratio as it depends on GPU count,
       # # batch size, sequence length
-      # packing_ratio:
+      # packing_ratio: auto
     drop_last: true
     num_workers: 8
     pin_memory: false
diff --git a/setup.py b/setup.py
index 7534d24503..4ecd34861a 100644
--- a/setup.py
+++ b/setup.py
@@ -3,6 +3,7 @@
 
 """MosaicML LLM Foundry package setup."""
 
+import copy
 import os
 import re
 
@@ -98,16 +99,13 @@
     'mosaicml[tensorboard]>=0.20.1,<0.21',
 ]
 
-extra_deps['gpu'] = [
-    'flash-attn==1.0.9',
-    # PyPI does not support direct dependencies, so we remove this line before uploading from PyPI
-    'xentropy-cuda-lib@git+https://github.com/HazyResearch/flash-attention.git@v1.0.9#subdirectory=csrc/xentropy',
-]
-
+# Flash 2 group kept for backwards compatibility
 extra_deps['gpu-flash2'] = [
     'flash-attn==2.5.0',
 ]
 
+extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2'])
+
 extra_deps['peft'] = [
     'mosaicml[peft]>=0.20.1,<0.21',
 ]
diff --git a/tests/models/layers/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py
index 411aab77a2..e9c0d32877 100644
--- a/tests/models/layers/test_huggingface_flash.py
+++ b/tests/models/layers/test_huggingface_flash.py
@@ -12,23 +12,14 @@
 from composer.utils import reproducibility
 from omegaconf import DictConfig
 from omegaconf import OmegaConf as om
+from transformers.models.llama.modeling_llama import LlamaAttention
 
 from llmfoundry import COMPOSER_MODEL_REGISTRY
 from llmfoundry.models.hf.hf_fsdp import rgetattr
-from llmfoundry.models.layers.attention import (is_flash_v1_installed,
-                                                is_flash_v2_installed)
-from llmfoundry.utils.builders import build_tokenizer
-
-# Before importing any transformers models, we need to disable transformers flash attention if
-# we are in an environment with flash attention version <2. Transformers hard errors on a not properly
-# gated import otherwise.
-if is_flash_v1_installed():
-    transformers.utils.is_flash_attn_available = lambda: False
-
-from transformers.models.llama.modeling_llama import LlamaAttention
-
+from llmfoundry.models.layers.attention import is_flash_v2_installed
 from llmfoundry.models.layers.llama_attention_monkeypatch import (
     llama_attention_patch_torch, llama_attention_patch_triton)
+from llmfoundry.utils.builders import build_tokenizer
 
 
 @pytest.mark.parametrize('patch_fn_name', ['torch', 'triton'])
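Downstream code that previously branched on `is_flash_v1_installed` can gate on `is_flash_v2_installed` alone after this change. A minimal sketch, assuming llm-foundry >= 0.6.0 installed with the `gpu` extra; the `attn_impl` selection shown here is illustrative, not part of the diff:

```python
# Minimal sketch (assumes llm-foundry >= 0.6.0 installed via `pip install -e ".[gpu]"`).
# With Flash Attention v1 support removed, only the v2 availability check remains.
from llmfoundry.models.layers.attention import is_flash_v2_installed

# Prefer the flash implementation when Flash Attention v2 is available,
# otherwise fall back to the pure-PyTorch implementation.
attn_impl = 'flash' if is_flash_v2_installed() else 'torch'
print(f'attn_impl={attn_impl}')
```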