diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py
index d1ed50b0cb..738281363a 100644
--- a/llmfoundry/models/hf/hf_causal_lm.py
+++ b/llmfoundry/models/hf/hf_causal_lm.py
@@ -109,6 +109,7 @@ def __init__(self, om_model_config: Union[DictConfig,
             use_auth_token=use_auth_token,
             # attn_implementation=requested_attention_implementation,
         )
+        config._flash_attn_2_enabled = use_flash_attention_2
 
         # This is not ideal, however Hugging Face's _autoset_attn_implementation function
         # forces you to load the model in fp16/bf16 if you want to use flash attention. Rather than loading
diff --git a/setup.py b/setup.py
index 2283e60d9c..6365d1a325 100644
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@ install_requires = [
     'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.17.1,<0.18',
     'accelerate>=0.25,<0.26',  # for HF inference `device_map`
-    'transformers>=4.36,<4.37',
+    'transformers>=4.35,<4.36',
     'mosaicml-streaming>=0.7.1,<0.8',
     'torch>=2.1,<2.1.1',
     'datasets==2.15.0',
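
For context, here is a minimal standalone sketch of the pattern the first hunk restores, assuming transformers 4.35.x, where the private `_flash_attn_2_enabled` config attribute is what the modeling code reads to select the flash-attn-2 attention path. The checkpoint name below is a hypothetical placeholder, not something from this repo:

```python
# Sketch of the workaround under the assumptions above: flip the private
# config flag instead of passing the 4.36-only `attn_implementation` kwarg,
# so the model does not have to be loaded in fp16/bf16 up front.
from transformers import AutoConfig, AutoModelForCausalLM

name = 'some-org/some-causal-lm'  # hypothetical checkpoint

config = AutoConfig.from_pretrained(name)
config._flash_attn_2_enabled = True  # private attribute; the 4.35-era mechanism

# The model loads in its default dtype; precision can be handled later by the trainer.
model = AutoModelForCausalLM.from_pretrained(name, config=config)
```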