diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index d1ed50b0cb..738281363a 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -109,6 +109,7 @@ def __init__(self, om_model_config: Union[DictConfig,
             use_auth_token=use_auth_token,
             # attn_implementation=requested_attention_implementation,
         )
+        config._flash_attn_2_enabled = use_flash_attention_2
 
         # This is not ideal, however Hugging Face's _autoset_attn_implementation function
         # forces you to load the model in fp16/bf16 if you want to use flash attention. Rather than loading
diff --git a/setup.py b/setup.py
index 2283e60d9c..6365d1a325 100644
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@ install_requires = [
     'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.17.1,<0.18',
     'accelerate>=0.25,<0.26',  # for HF inference `device_map`
-    'transformers>=4.36,<4.37',
+    'transformers>=4.35,<4.36',
     'mosaicml-streaming>=0.7.1,<0.8',
     'torch>=2.1,<2.1.1',
     'datasets==2.15.0',
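
For context, here is a minimal standalone sketch of the pattern the first hunk restores, assuming transformers 4.35.x, where the private `_flash_attn_2_enabled` config attribute is what the modeling code reads to select the flash-attn-2 attention path. The checkpoint name below is a hypothetical placeholder, not something from this repo:

```python
# Sketch of the workaround under the assumptions above: flip the private
# config flag instead of passing the 4.36-only `attn_implementation` kwarg,
# so the model does not have to be loaded in fp16/bf16 up front.
from transformers import AutoConfig, AutoModelForCausalLM

name = 'some-org/some-causal-lm'  # hypothetical checkpoint

config = AutoConfig.from_pretrained(name)
config._flash_attn_2_enabled = True  # private attribute; the 4.35-era mechanism

# The model loads in its default dtype; precision can be handled later by the trainer.
model = AutoModelForCausalLM.from_pretrained(name, config=config)
```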