From 20f6c3f58cad8055fec3e7614cfc64ade437babd Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 26 Oct 2023 19:42:32 +0000
Subject: [PATCH 1/5] Fix memory leak

---
 src/accelerate/utils/transformer_engine.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/accelerate/utils/transformer_engine.py b/src/accelerate/utils/transformer_engine.py
index a6342d7150f..b39d8efeaf3 100644
--- a/src/accelerate/utils/transformer_engine.py
+++ b/src/accelerate/utils/transformer_engine.py
@@ -36,15 +36,15 @@ def convert_model(model, to_transformer_engine=True, _convert_linear=True, _conv
             te_module = te.Linear(
                 module.in_features, module.out_features, bias=has_bias, params_dtype=module.weight.dtype
             )
-            te_module.weight.data = module.weight.data.clone()
+            module.weight.copy_(te_module.weight)
             if has_bias:
-                te_module.bias.data = module.bias.data.clone()
+                module.bias.copy_(te_module.bias)
 
             setattr(model, name, te_module)
         elif isinstance(module, nn.LayerNorm) and to_transformer_engine and _convert_ln:
             te_module = te.LayerNorm(module.normalized_shape[0], eps=module.eps, params_dtype=module.weight.dtype)
-            te_module.weight.data = module.weight.data.clone()
-            te_module.bias.data = module.bias.data.clone()
+            module.weight.copy_(te_module.weight)
+            module.bias.copy_(te_module.bias)
 
             setattr(model, name, te_module)
         elif isinstance(module, te.Linear) and not to_transformer_engine and _convert_linear:
@@ -52,15 +52,15 @@ def convert_model(model, to_transformer_engine=True, _convert_linear=True, _conv
             new_module = nn.Linear(
                 module.in_features, module.out_features, bias=has_bias, params_dtype=module.weight.dtype
             )
-            new_module.weight.data = module.weight.data.clone()
+            module.weight.copy_(new_module.weight)
             if has_bias:
-                new_module.bias.data = module.bias.data.clone()
+                module.bias.copy_(new_module.bias)
 
             setattr(model, name, new_module)
         elif isinstance(module, te.LayerNorm) and not to_transformer_engine and _convert_ln:
             new_module = nn.LayerNorm(module.normalized_shape[0], eps=module.eps, params_dtype=module.weight.dtype)
-            new_module.weight.data = module.weight.data.clone()
-            new_module.bias.data = module.bias.data.clone()
+            module.weight.copy_(new_module.weight)
+            module.bias.copy_(new_module.bias)
 
             setattr(model, name, new_module)
         else:

From b05da0d07a5f65583f0880bc8970d2b15c5bcfa7 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 26 Oct 2023 19:45:10 +0000
Subject: [PATCH 2/5] Change when model is moved to cuda

---
 src/accelerate/accelerator.py | 58 +++++++++++++++++------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 3bd420f8f7b..242f6401b7f 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1357,35 +1357,6 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
                 " Please rerun your script specifying `--num_processes=1` or by launching with `python {{myscript.py}}`."
             )
 
-        if (getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)) and getattr(
-            model, "hf_device_map", False
-        ):
-            model_devices = set(model.hf_device_map.values())
-            if len(model_devices) > 1 and self.distributed_type != DistributedType.NO:
-                raise ValueError(
-                    "You can't train a model that has been loaded in 8-bit precision on multiple devices in any distributed mode."
-                    " In order to use 8-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism."
-                    " Therefore you should not specify that you are under any distributed regime in your accelerate config."
-                )
-            current_device = list(model_devices)[0]
-            current_device_index = current_device.index if isinstance(current_device, torch.device) else current_device
-
-            if torch.device(current_device_index) != self.device:
-                # if on the first device (GPU 0) we don't care
-                if (self.device.index is not None) or (current_device_index != 0):
-                    raise ValueError(
-                        "You can't train a model that has been loaded in 8-bit precision on a different device than the one "
-                        "you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}"
-                        "you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}"
-                    )
-
-            if "cpu" in model_devices or "disk" in model_devices:
-                raise ValueError(
-                    "You can't train a model that has been loaded in 8-bit precision with CPU or disk offload."
-                )
-        elif device_placement and not self.verify_device_map(model):
-            model = model.to(self.device)
-
         if self.native_amp:
             model._original_forward = model.forward
             model_forward_func = model.forward.__func__ if hasattr(model.forward, "__func__") else model.forward
@@ -1416,6 +1387,35 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
                     "or higher, compute capability of 8.9 or higher). Will use FP16 instead."
                 )
             model.forward = fp8_autocast(enabled=fp8_enabled, fp8_recipe=fp8_recipe)(model.forward)
+
+        if (getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)) and getattr(
+            model, "hf_device_map", False
+        ):
+            model_devices = set(model.hf_device_map.values())
+            if len(model_devices) > 1 and self.distributed_type != DistributedType.NO:
+                raise ValueError(
+                    "You can't train a model that has been loaded in 8-bit precision on multiple devices in any distributed mode."
+                    " In order to use 8-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism."
+                    " Therefore you should not specify that you are under any distributed regime in your accelerate config."
+                )
+            current_device = list(model_devices)[0]
+            current_device_index = current_device.index if isinstance(current_device, torch.device) else current_device
+
+            if torch.device(current_device_index) != self.device:
+                # if on the first device (GPU 0) we don't care
+                if (self.device.index is not None) or (current_device_index != 0):
+                    raise ValueError(
+                        "You can't train a model that has been loaded in 8-bit precision on a different device than the one "
+                        "you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}"
+                        "you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}"
+                    )
+
+            if "cpu" in model_devices or "disk" in model_devices:
+                raise ValueError(
+                    "You can't train a model that has been loaded in 8-bit precision with CPU or disk offload."
+                )
+        elif device_placement and not self.verify_device_map(model):
+            model = model.to(self.device)
         if not evaluation_mode:
             if self.distributed_type in (
                 DistributedType.MULTI_GPU,

From f8f9b2fbbe6b266ebd819d37dcbb98a8e8813721 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Fri, 27 Oct 2023 12:58:56 +0000
Subject: [PATCH 3/5] Add from PR

---
 src/accelerate/accelerator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 242f6401b7f..cc34cd5cb1c 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1406,7 +1406,6 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
                 if (self.device.index is not None) or (current_device_index != 0):
                     raise ValueError(
                         "You can't train a model that has been loaded in 8-bit precision on a different device than the one "
-                        "you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}"
                         "you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device() or device_map={'':torch.xpu.current_device()}"
                     )
 

From bc58807b14555df212041ed0dc38e7b3bd5c7acf Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Fri, 27 Oct 2023 16:52:03 +0000
Subject: [PATCH 4/5] Remove link

---
 src/accelerate/accelerator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index cc34cd5cb1c..5bb96d8c224 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1372,7 +1372,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
                 with torch.no_grad():
                     convert_model(model)
                 model._converted_to_transformer_engine = True
-            model._original_forward = model.forward
+            #model._original_forward = model.forward
 
             kwargs = self.fp8_recipe_handler.to_kwargs() if self.fp8_recipe_handler is not None else {}
             if "fp8_format" in kwargs:

From d1df551835c14e98b066d41f9b527005c49e964f Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Tue, 31 Oct 2023 09:08:42 -0400
Subject: [PATCH 5/5] Undo original forward link

---
 src/accelerate/accelerator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 5bb96d8c224..cc34cd5cb1c 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1372,7 +1372,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
                 with torch.no_grad():
                     convert_model(model)
                 model._converted_to_transformer_engine = True
-            #model._original_forward = model.forward
+            model._original_forward = model.forward
 
             kwargs = self.fp8_recipe_handler.to_kwargs() if self.fp8_recipe_handler is not None else {}
             if "fp8_format" in kwargs:
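
For context on PATCH 1/5: it swaps `.data = ....clone()` assignments for in-place `copy_()` calls when transferring weights between an existing module and its replacement. The snippet below is a minimal, self-contained sketch of the allocation difference between the two idioms, not the accelerate code itself; `src` and `dst` are hypothetical stand-in modules, not names from the codebase.

import torch
import torch.nn as nn

src = nn.Linear(1024, 1024)  # module whose weights should be carried over
dst = nn.Linear(1024, 1024)  # freshly constructed replacement module

# Idiom removed by the patch: clone() allocates a brand-new tensor for every
# parameter and rebinds dst.weight.data to it, so peak memory briefly holds
# two full copies of each weight while a model is being converted.
dst.weight.data = src.weight.data.clone()

# Idiom introduced by the patch: copy_() writes into the storage the destination
# parameter already owns, so no extra tensor is allocated during the transfer.
with torch.no_grad():
    dst.weight.copy_(src.weight)
    dst.bias.copy_(src.bias)

The torch.no_grad() guard mirrors how convert_model is already invoked in accelerator.py (visible in the context lines of PATCH 4/5 and 5/5), and it is required here because copy_() on a leaf parameter that requires grad would otherwise raise an error.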