From 4b7a262d3eec51e8b5511371b1a4346d58c43e1e Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Thu, 28 Dec 2023 17:58:52 -0800
Subject: [PATCH 1/3] bump transformers

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c030fe3268..09c13455b8 100644
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@
 install_requires = [
     'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.17.1,<0.18',
     'accelerate>=0.25,<0.26',  # for HF inference `device_map`
-    'transformers>=4.36,<4.37',
+    'transformers>=4.36.2,<4.37',
     'mosaicml-streaming>=0.7.1,<0.8',
     'torch>=2.1,<2.1.1',
     'datasets==2.15.0',

From eb4a596ef79d91837249d3fe9797558e618a42b4 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Fri, 29 Dec 2023 00:28:34 -0800
Subject: [PATCH 2/3] try assign

---
 llmfoundry/callbacks/hf_checkpointer.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py
index 491d510188..83a729c627 100644
--- a/llmfoundry/callbacks/hf_checkpointer.py
+++ b/llmfoundry/callbacks/hf_checkpointer.py
@@ -24,6 +24,7 @@
 from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM
 from llmfoundry.utils.huggingface_hub_utils import \
     edit_files_for_hf_compatibility
+from llmfoundry.models.utils import init_empty_weights
 
 log = logging.getLogger(__name__)
 
@@ -224,9 +225,10 @@ def _save_checkpoint(self, state: State, logger: Logger):
         # TODO: after torch 2.1, we can load a state dict into a meta model
         # and skip the extra model init
         log.debug(f'Creating new model instance')
-        new_model_instance = type(original_model)(copied_config)
-        new_model_instance.to(dtype=self.dtype)
-        new_model_instance.load_state_dict(state_dict)
+        with init_empty_weights(include_buffers=False):
+            new_model_instance = type(original_model)(copied_config)
+        # new_model_instance.to(dtype=self.dtype)
+        new_model_instance.load_state_dict(state_dict, assign=True)
         del state_dict
 
         log.debug('Saving Hugging Face checkpoint to disk')

From 331212744268dcc0f9504b1bca7b70320de56e61 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Fri, 19 Jan 2024 16:58:16 -0800
Subject: [PATCH 3/3] revert assign

---
 llmfoundry/callbacks/hf_checkpointer.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py
index 83a729c627..491d510188 100644
--- a/llmfoundry/callbacks/hf_checkpointer.py
+++ b/llmfoundry/callbacks/hf_checkpointer.py
@@ -24,7 +24,6 @@
 from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM
 from llmfoundry.utils.huggingface_hub_utils import \
     edit_files_for_hf_compatibility
-from llmfoundry.models.utils import init_empty_weights
 
 log = logging.getLogger(__name__)
 
@@ -225,10 +224,9 @@ def _save_checkpoint(self, state: State, logger: Logger):
         # TODO: after torch 2.1, we can load a state dict into a meta model
         # and skip the extra model init
         log.debug(f'Creating new model instance')
-        with init_empty_weights(include_buffers=False):
-            new_model_instance = type(original_model)(copied_config)
-        # new_model_instance.to(dtype=self.dtype)
-        new_model_instance.load_state_dict(state_dict, assign=True)
+        new_model_instance = type(original_model)(copied_config)
+        new_model_instance.to(dtype=self.dtype)
+        new_model_instance.load_state_dict(state_dict)
         del state_dict
 
         log.debug('Saving Hugging Face checkpoint to disk')
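
For context on the technique PATCH 2/3 attempts before PATCH 3/3 reverts it:
the idea is to build the model on the meta device, so no real weight memory
is allocated, and then materialize it directly from the checkpoint with
load_state_dict(..., assign=True), which torch added in 2.1. Below is a
minimal, self-contained sketch of that pattern, not the llmfoundry code: it
assumes accelerate's init_empty_weights rather than the llmfoundry wrapper,
and the gpt2 config and AutoModelForCausalLM are stand-ins for the
copied_config and model class used in the callback.

from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained('gpt2')  # stand-in for copied_config

# Stand-in for the trained weights pulled out of the Composer state.
reference = AutoModelForCausalLM.from_config(config)
state_dict = reference.state_dict()

# Parameters land on the 'meta' device, so no real memory is allocated.
# include_buffers=False keeps buffers as real tensors, since non-persistent
# buffers never appear in the state dict and could not be restored from it.
with init_empty_weights(include_buffers=False):
    new_model = AutoModelForCausalLM.from_config(config)

# assign=True (torch >= 2.1) swaps the checkpoint tensors into the module
# instead of copying them into the (meta) parameters, so the model is
# materialized straight from the state dict with no extra copy.
new_model.load_state_dict(state_dict, assign=True)

One consequence of assign=True, and plausibly part of why PATCH 3/3 reverts
it, is that the new model simply adopts the dtypes of the checkpoint
tensors, so the explicit new_model_instance.to(dtype=self.dtype) cast (which
PATCH 2/3 comments out) no longer has a natural place to happen.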