
Commit

clean up
dakinggg committed Dec 15, 2023
1 parent 68d6f00 commit 004a023
Showing 2 changed files with 4 additions and 5 deletions.
5 changes: 3 additions & 2 deletions llmfoundry/__init__.py
@@ -5,6 +5,8 @@

 try:
     import warnings
+    # bitsandbytes is a very noisy library. A lot of it is print statements that we can't easily suppress,
+    # but we can at least suppress a bunch of spurious warnings.
     warnings.filterwarnings('ignore',
                             category=UserWarning,
                             module='bitsandbytes')
@@ -13,13 +15,12 @@

     from llmfoundry.utils.logging_utils import SpecificWarningFilter

-    # Filter out Hugging Face warning
+    # Filter out Hugging Face warning for not using a pinned revision of the model
     hf_dynamic_modules_logger = logging.getLogger(
         'transformers.dynamic_module_utils')
     new_files_warning_filter = SpecificWarningFilter(
         'A new version of the following files was downloaded from')

-    # We will trim examples later in the collate_fn, so we want to silence this warning from Hugging Face
     hf_dynamic_modules_logger.addFilter(new_files_warning_filter)

     # Before importing any transformers models, we need to disable transformers flash attention if
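Note: the SpecificWarningFilter implementation itself is not part of this diff (it is imported from llmfoundry.utils.logging_utils). As a minimal sketch, assuming it is a standard logging.Filter keyed on a message prefix, the two suppression mechanisms in this file combine roughly like this (the class body is illustrative, not the actual llm-foundry code):

import logging
import warnings


class SpecificWarningFilter(logging.Filter):
    """Illustrative stand-in for llmfoundry.utils.logging_utils.SpecificWarningFilter."""

    def __init__(self, message_to_suppress: str) -> None:
        super().__init__()
        self.message_to_suppress = message_to_suppress

    def filter(self, record: logging.LogRecord) -> bool:
        # Returning False drops the record; everything else passes through.
        return not record.getMessage().startswith(self.message_to_suppress)


# Warnings raised through the warnings module (e.g. by bitsandbytes) are
# silenced per-module, as in the first hunk above.
warnings.filterwarnings('ignore', category=UserWarning, module='bitsandbytes')

# Warnings emitted through a logger (e.g. the Hugging Face dynamic modules
# logger) are silenced by attaching a filter to that logger, as in the second hunk.
hf_dynamic_modules_logger = logging.getLogger('transformers.dynamic_module_utils')
hf_dynamic_modules_logger.addFilter(
    SpecificWarningFilter('A new version of the following files was downloaded from'))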
4 changes: 1 addition & 3 deletions llmfoundry/models/hf/hf_causal_lm.py
@@ -108,11 +108,9 @@ def __init__(self, om_model_config: Union[DictConfig,
             trust_remote_code=trust_remote_code,
             use_auth_token=use_auth_token,
             attn_implementation=requested_attention_implementation,
-            use_cache=False,
+            use_cache=False,  # Necessary due to https://github.com/huggingface/transformers/issues/28056
         )

-        # config._flash_attn_2_enabled = use_flash_attention_2
-
         # This is not ideal, however Hugging Face's _autoset_attn_implementation function
         # forces you to load the model in fp16/bf16 if you want to use flash attention. Rather than loading
         # the model and then casting it back to fp32, we are monkeypatching their check.
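For context, the keyword arguments above are passed to AutoConfig.from_pretrained, and the trailing comment refers to a monkeypatch whose body is collapsed in this view. A hedged sketch of the pattern, with an illustrative model name and attention implementation (these values are not taken from this diff):

from transformers import AutoConfig, PreTrainedModel

# Illustrative values; in hf_causal_lm.py these come from om_model_config.
pretrained_model_name_or_path = 'mosaicml/mpt-7b'
requested_attention_implementation = 'flash_attention_2'

config = AutoConfig.from_pretrained(
    pretrained_model_name_or_path,
    trust_remote_code=True,
    attn_implementation=requested_attention_implementation,
    use_cache=False,  # Necessary due to https://github.com/huggingface/transformers/issues/28056
)


# One way to bypass the dtype check in _autoset_attn_implementation is to
# replace it with a classmethod that simply records the requested
# implementation. The actual patch in this file is collapsed above, so this
# is a sketch of the approach the comment describes, not the exact code.
def _autoset_attn_implementation_monkeypatch(cls, config, *args, **kwargs):
    config._attn_implementation = requested_attention_implementation
    return config


PreTrainedModel._autoset_attn_implementation = classmethod(
    _autoset_attn_implementation_monkeypatch)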
