From 7cb1c084bcc0d2f013b032272887cd28c4c499cc Mon Sep 17 00:00:00 2001
From: Daniel King <daniel@mosaicml.com>
Date: Tue, 24 Sep 2024 19:32:35 -0700
Subject: [PATCH 1/7] bump version to 0.13.0.dev0

---
 llmfoundry/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llmfoundry/_version.py b/llmfoundry/_version.py
index 2f1f590b19..0cddcaf967 100644
--- a/llmfoundry/_version.py
+++ b/llmfoundry/_version.py
@@ -3,4 +3,4 @@
 
 """The LLM Foundry Version."""
 
-__version__ = '0.12.0.dev0'
+__version__ = '0.13.0.dev0'

From 6858db9d3988e35c54275f5bab44404ca95744a3 Mon Sep 17 00:00:00 2001
From: Daniel King <daniel@mosaicml.com>
Date: Tue, 24 Sep 2024 19:32:44 -0700
Subject: [PATCH 2/7] deprecations

---
 llmfoundry/command_utils/eval.py      |   2 +-
 llmfoundry/models/hf/__init__.py      |   2 -
 llmfoundry/models/hf/model_wrapper.py | 103 --------------------------
 tests/models/test_model.py            |   6 +-
 4 files changed, 4 insertions(+), 109 deletions(-)
 delete mode 100644 llmfoundry/models/hf/model_wrapper.py

diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py
index 70c4319ea8..73127e8a07 100644
--- a/llmfoundry/command_utils/eval.py
+++ b/llmfoundry/command_utils/eval.py
@@ -82,7 +82,7 @@ def evaluate_model(
         warnings.warn(
             VersionedDeprecationWarning(
                 'The argument fsdp_config is deprecated. Please use parallelism_config instead.',
-                remove_version='0.13.0',
+                remove_version='0.14.0',
             ),
         )
     if fsdp_config and parallelism_config:
diff --git a/llmfoundry/models/hf/__init__.py b/llmfoundry/models/hf/__init__.py
index 2f25f92940..03df90e8cd 100644
--- a/llmfoundry/models/hf/__init__.py
+++ b/llmfoundry/models/hf/__init__.py
@@ -9,7 +9,6 @@
     prepare_hf_model_for_fsdp,
 )
 from llmfoundry.models.hf.hf_t5 import ComposerHFT5
-from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP
 
 __all__ = [
     'BaseHuggingFaceModel',
@@ -18,5 +17,4 @@
     'prepare_hf_causal_lm_model_for_fsdp',
     'prepare_hf_enc_dec_model_for_fsdp',
     'prepare_hf_model_for_fsdp',
-    'HuggingFaceModelWithFSDP',
 ]
diff --git a/llmfoundry/models/hf/model_wrapper.py b/llmfoundry/models/hf/model_wrapper.py
deleted file mode 100644
index c8805e5d6d..0000000000
--- a/llmfoundry/models/hf/model_wrapper.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Copyright 2022 MosaicML LLM Foundry authors
-# SPDX-License-Identifier: Apache-2.0
-
-"""Re-usable :class:`.ComposerModel` for LLM HF Models."""
-
-from __future__ import annotations
-
-import warnings
-from collections import UserDict
-from typing import TYPE_CHECKING, Mapping, Optional, Union
-
-import transformers
-from composer.models.huggingface import HuggingFaceModel
-from torchmetrics import Metric
-from transformers import PreTrainedTokenizerBase
-from transformers.utils.generic import ModelOutput
-
-from llmfoundry.models.hf.hf_fsdp import prepare_hf_model_for_fsdp
-from llmfoundry.utils.warnings import VersionedDeprecationWarning
-
-if TYPE_CHECKING:
-    from peft import PeftConfig, PeftModel
-
-__all__ = ['HuggingFaceModelWithFSDP']
-
-# HuggingFace hardcodes the ignore index to -100
-_HF_IGNORE_INDEX = -100
-
-
-class HuggingFaceModelWithFSDP(HuggingFaceModel):
-    """Wrapper around HuggingFaceModel.
-
-    Handles preparation for FSDP wrapping.
-    """
-
-    def __init__(
-        self,
-        model: Union[transformers.PreTrainedModel, 'PeftModel'],
-        tokenizer: Optional[PreTrainedTokenizerBase] = None,
-        metrics: Optional[list[Metric]] = None,
-        eval_metrics: Optional[list[Metric]] = None,
-        shift_labels: bool = False,
-        allow_embedding_resizing: bool = False,
-        init_device: Optional[str] = None,
-        peft_config: Optional['PeftConfig'] = None,
-        should_save_peft_only: bool = True,
-    ):
-        warnings.warn(
-            VersionedDeprecationWarning(
-                '`HuggingFaceModelWithFSDP` is deprecated. In the future please use `BaseHuggingFaceModel`.',
-                remove_version='0.13.0',
-            ),
-        )
-        super().__init__(
-            model,
-            tokenizer,
-            use_logits=True,
-            metrics=metrics,
-            eval_metrics=eval_metrics,
-            shift_labels=shift_labels,
-            allow_embedding_resizing=allow_embedding_resizing,
-            peft_config=peft_config,
-            should_save_peft_only=should_save_peft_only,
-        )
-
-        self.prepare_inner_model(self.model, init_device)
-
-    def forward(self, batch: Mapping):
-        if isinstance(batch, dict) or isinstance(batch, UserDict):
-            # Further input validation is left to the huggingface forward call
-            batch = {
-                k: v for k, v in batch.items() if k in self.model_forward_args
-            }
-            output = self.model(**batch)  # type: ignore (thirdparty)
-        else:
-            raise ValueError(
-                'Unexpected batch type. Expected a dictionary with keys corresponding to the inputs to the forward function of the Huggingface model',
-            )
-        return output
-
-    def loss(self, outputs: ModelOutput, batch: Mapping):
-        if self.config.use_return_dict:
-            return outputs['loss']
-        # loss is at index 0 in the output tuple, logits are at index 1
-        return outputs[:2]
-
-    @staticmethod
-    def prepare_inner_model(
-        model: Union[transformers.PreTrainedModel, 'PeftModel'],
-        init_device: Optional[str] = None,
-    ):
-        """Prepare the inner model for FSDP wrapping.
-
-        Args:
-            model: The model to prepare.
-            init_device: The device to initialize the model on.
-        """
-        # Note: We need to add the FSDP related attributes to the model AFTER the super init,
-        # so that the (possible) embedding resizing doesn't destroy them
-        prepare_hf_model_for_fsdp(model, init_device)
-
-        # This provides support for meta initialization when using FSDP
-        model.param_init_fn = lambda module: model._init_weights(module)
diff --git a/tests/models/test_model.py b/tests/models/test_model.py
index eeb6bf0d90..92effffdd8 100644
--- a/tests/models/test_model.py
+++ b/tests/models/test_model.py
@@ -39,7 +39,7 @@
 
 from llmfoundry import ComposerHFCausalLM
 from llmfoundry.layers_registry import norms
-from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP
+from llmfoundry.models.hf import BaseHuggingFaceModel
 from llmfoundry.models.layers import build_alibi_bias
 from llmfoundry.models.layers.attention import (
     check_alibi_support,
@@ -2560,7 +2560,7 @@ def test_hf_init(
         False,
     )
 
-    model = HuggingFaceModelWithFSDP(model, tokenizer)
+    model = BaseHuggingFaceModel(model, tokenizer)
 
     batch = gen_random_batch(batch_size, test_cfg)
 
@@ -2609,7 +2609,7 @@ def test_head_dim_8_flash_mqa_attn(batch_size: int = 2):
 
     mpt = MPTForCausalLM(hf_config)
 
-    model = HuggingFaceModelWithFSDP(mpt, tokenizer, shift_labels=True)
+    model = BaseHuggingFaceModel(mpt, tokenizer, shift_labels=True)
 
     model = model.to(test_cfg.device)
     batch = gen_random_batch(batch_size, test_cfg)

From 259cc76c20e11ce9f7b525173db7dafebfb035e3 Mon Sep 17 00:00:00 2001
From: Daniel King <daniel@mosaicml.com>
Date: Tue, 24 Sep 2024 20:36:19 -0700
Subject: [PATCH 3/7] fix?

---
 tests/models/test_model.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/models/test_model.py b/tests/models/test_model.py
index 92effffdd8..16f02d63e2 100644
--- a/tests/models/test_model.py
+++ b/tests/models/test_model.py
@@ -2551,8 +2551,10 @@ def test_hf_init(
         betas=(0.9, 0.99),
     )
 
+    model = BaseHuggingFaceModel(str(save_path), tokenizer)
+
     prepare_fsdp_module(
-        model,
+        model.model,
         optimizer,
         FSDPConfig(**fsdp_config),
         precision,
@@ -2560,8 +2562,6 @@ def test_hf_init(
         False,
     )
 
-    model = BaseHuggingFaceModel(model, tokenizer)
-
     batch = gen_random_batch(batch_size, test_cfg)
 
     original_params = next(model.parameters()).clone().data
@@ -2579,7 +2579,10 @@ def test_hf_init(
 
 
 @pytest.mark.gpu
-def test_head_dim_8_flash_mqa_attn(batch_size: int = 2):
+def test_head_dim_8_flash_mqa_attn(
+    tmp_path: pathlib.Path,
+    batch_size: int = 2,
+):
     test_cfg = get_config(conf_path='scripts/train/yamls/pretrain/testing.yaml')
     test_cfg.device = torch.cuda.current_device()
 
@@ -2608,8 +2611,9 @@ def test_head_dim_8_flash_mqa_attn(batch_size: int = 2):
     )
 
     mpt = MPTForCausalLM(hf_config)
+    mpt.save_pretrained(tmp_path)
 
-    model = BaseHuggingFaceModel(mpt, tokenizer, shift_labels=True)
+    model = BaseHuggingFaceModel(str(tmp_path), tokenizer, shift_labels=True)
 
     model = model.to(test_cfg.device)
     batch = gen_random_batch(batch_size, test_cfg)

From e27bb7b58646dc0b4b0145e20b4efb4c76353df3 Mon Sep 17 00:00:00 2001
From: Daniel King <daniel@mosaicml.com>
Date: Tue, 24 Sep 2024 21:21:17 -0700
Subject: [PATCH 4/7] Revert "fix?"

This reverts commit 259cc76c20e11ce9f7b525173db7dafebfb035e3.
---
 tests/models/test_model.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/tests/models/test_model.py b/tests/models/test_model.py
index 16f02d63e2..92effffdd8 100644
--- a/tests/models/test_model.py
+++ b/tests/models/test_model.py
@@ -2551,10 +2551,8 @@ def test_hf_init(
         betas=(0.9, 0.99),
     )
 
-    model = BaseHuggingFaceModel(str(save_path), tokenizer)
-
     prepare_fsdp_module(
-        model.model,
+        model,
         optimizer,
         FSDPConfig(**fsdp_config),
         precision,
@@ -2562,6 +2560,8 @@ def test_hf_init(
         False,
     )
 
+    model = BaseHuggingFaceModel(model, tokenizer)
+
     batch = gen_random_batch(batch_size, test_cfg)
 
     original_params = next(model.parameters()).clone().data
@@ -2579,10 +2579,7 @@ def test_hf_init(
 
 
 @pytest.mark.gpu
-def test_head_dim_8_flash_mqa_attn(
-    tmp_path: pathlib.Path,
-    batch_size: int = 2,
-):
+def test_head_dim_8_flash_mqa_attn(batch_size: int = 2):
     test_cfg = get_config(conf_path='scripts/train/yamls/pretrain/testing.yaml')
     test_cfg.device = torch.cuda.current_device()
 
@@ -2611,9 +2608,8 @@ def test_head_dim_8_flash_mqa_attn(
     )
 
     mpt = MPTForCausalLM(hf_config)
-    mpt.save_pretrained(tmp_path)
 
-    model = BaseHuggingFaceModel(str(tmp_path), tokenizer, shift_labels=True)
+    model = BaseHuggingFaceModel(mpt, tokenizer, shift_labels=True)
 
     model = model.to(test_cfg.device)
     batch = gen_random_batch(batch_size, test_cfg)

From 3b316c43bbb56eafcc301e59d9947bee58c8dc4a Mon Sep 17 00:00:00 2001
From: Daniel King <daniel@mosaicml.com>
Date: Tue, 24 Sep 2024 21:21:27 -0700
Subject: [PATCH 5/7] Revert "deprecations"

This reverts commit 6858db9d3988e35c54275f5bab44404ca95744a3.
---
 llmfoundry/command_utils/eval.py      |   2 +-
 llmfoundry/models/hf/__init__.py      |   2 +
 llmfoundry/models/hf/model_wrapper.py | 103 ++++++++++++++++++++++++++
 tests/models/test_model.py            |   6 +-
 4 files changed, 109 insertions(+), 4 deletions(-)
 create mode 100644 llmfoundry/models/hf/model_wrapper.py

diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py
index 73127e8a07..70c4319ea8 100644
--- a/llmfoundry/command_utils/eval.py
+++ b/llmfoundry/command_utils/eval.py
@@ -82,7 +82,7 @@ def evaluate_model(
         warnings.warn(
             VersionedDeprecationWarning(
                 'The argument fsdp_config is deprecated. Please use parallelism_config instead.',
-                remove_version='0.14.0',
+                remove_version='0.13.0',
             ),
         )
     if fsdp_config and parallelism_config:
diff --git a/llmfoundry/models/hf/__init__.py b/llmfoundry/models/hf/__init__.py
index 03df90e8cd..2f25f92940 100644
--- a/llmfoundry/models/hf/__init__.py
+++ b/llmfoundry/models/hf/__init__.py
@@ -9,6 +9,7 @@
     prepare_hf_model_for_fsdp,
 )
 from llmfoundry.models.hf.hf_t5 import ComposerHFT5
+from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP
 
 __all__ = [
     'BaseHuggingFaceModel',
@@ -17,4 +18,5 @@
     'prepare_hf_causal_lm_model_for_fsdp',
     'prepare_hf_enc_dec_model_for_fsdp',
     'prepare_hf_model_for_fsdp',
+    'HuggingFaceModelWithFSDP',
 ]
diff --git a/llmfoundry/models/hf/model_wrapper.py b/llmfoundry/models/hf/model_wrapper.py
new file mode 100644
index 0000000000..c8805e5d6d
--- /dev/null
+++ b/llmfoundry/models/hf/model_wrapper.py
@@ -0,0 +1,103 @@
+# Copyright 2022 MosaicML LLM Foundry authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Re-usable :class:`.ComposerModel` for LLM HF Models."""
+
+from __future__ import annotations
+
+import warnings
+from collections import UserDict
+from typing import TYPE_CHECKING, Mapping, Optional, Union
+
+import transformers
+from composer.models.huggingface import HuggingFaceModel
+from torchmetrics import Metric
+from transformers import PreTrainedTokenizerBase
+from transformers.utils.generic import ModelOutput
+
+from llmfoundry.models.hf.hf_fsdp import prepare_hf_model_for_fsdp
+from llmfoundry.utils.warnings import VersionedDeprecationWarning
+
+if TYPE_CHECKING:
+    from peft import PeftConfig, PeftModel
+
+__all__ = ['HuggingFaceModelWithFSDP']
+
+# HuggingFace hardcodes the ignore index to -100
+_HF_IGNORE_INDEX = -100
+
+
+class HuggingFaceModelWithFSDP(HuggingFaceModel):
+    """Wrapper around HuggingFaceModel.
+
+    Handles preparation for FSDP wrapping.
+    """
+
+    def __init__(
+        self,
+        model: Union[transformers.PreTrainedModel, 'PeftModel'],
+        tokenizer: Optional[PreTrainedTokenizerBase] = None,
+        metrics: Optional[list[Metric]] = None,
+        eval_metrics: Optional[list[Metric]] = None,
+        shift_labels: bool = False,
+        allow_embedding_resizing: bool = False,
+        init_device: Optional[str] = None,
+        peft_config: Optional['PeftConfig'] = None,
+        should_save_peft_only: bool = True,
+    ):
+        warnings.warn(
+            VersionedDeprecationWarning(
+                '`HuggingFaceModelWithFSDP` is deprecated. In the future please use `BaseHuggingFaceModel`.',
+                remove_version='0.13.0',
+            ),
+        )
+        super().__init__(
+            model,
+            tokenizer,
+            use_logits=True,
+            metrics=metrics,
+            eval_metrics=eval_metrics,
+            shift_labels=shift_labels,
+            allow_embedding_resizing=allow_embedding_resizing,
+            peft_config=peft_config,
+            should_save_peft_only=should_save_peft_only,
+        )
+
+        self.prepare_inner_model(self.model, init_device)
+
+    def forward(self, batch: Mapping):
+        if isinstance(batch, dict) or isinstance(batch, UserDict):
+            # Further input validation is left to the huggingface forward call
+            batch = {
+                k: v for k, v in batch.items() if k in self.model_forward_args
+            }
+            output = self.model(**batch)  # type: ignore (thirdparty)
+        else:
+            raise ValueError(
+                'Unexpected batch type. Expected a dictionary with keys corresponding to the inputs to the forward function of the Huggingface model',
+            )
+        return output
+
+    def loss(self, outputs: ModelOutput, batch: Mapping):
+        if self.config.use_return_dict:
+            return outputs['loss']
+        # loss is at index 0 in the output tuple, logits are at index 1
+        return outputs[:2]
+
+    @staticmethod
+    def prepare_inner_model(
+        model: Union[transformers.PreTrainedModel, 'PeftModel'],
+        init_device: Optional[str] = None,
+    ):
+        """Prepare the inner model for FSDP wrapping.
+
+        Args:
+            model: The model to prepare.
+            init_device: The device to initialize the model on.
+        """
+        # Note: We need to add the FSDP related attributes to the model AFTER the super init,
+        # so that the (possible) embedding resizing doesn't destroy them
+        prepare_hf_model_for_fsdp(model, init_device)
+
+        # This provides support for meta initialization when using FSDP
+        model.param_init_fn = lambda module: model._init_weights(module)
diff --git a/tests/models/test_model.py b/tests/models/test_model.py
index 92effffdd8..eeb6bf0d90 100644
--- a/tests/models/test_model.py
+++ b/tests/models/test_model.py
@@ -39,7 +39,7 @@
 
 from llmfoundry import ComposerHFCausalLM
 from llmfoundry.layers_registry import norms
-from llmfoundry.models.hf import BaseHuggingFaceModel
+from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP
 from llmfoundry.models.layers import build_alibi_bias
 from llmfoundry.models.layers.attention import (
     check_alibi_support,
@@ -2560,7 +2560,7 @@ def test_hf_init(
         False,
     )
 
-    model = BaseHuggingFaceModel(model, tokenizer)
+    model = HuggingFaceModelWithFSDP(model, tokenizer)
 
     batch = gen_random_batch(batch_size, test_cfg)
 
@@ -2609,7 +2609,7 @@ def test_head_dim_8_flash_mqa_attn(batch_size: int = 2):
 
     mpt = MPTForCausalLM(hf_config)
 
-    model = BaseHuggingFaceModel(mpt, tokenizer, shift_labels=True)
+    model = HuggingFaceModelWithFSDP(mpt, tokenizer, shift_labels=True)
 
     model = model.to(test_cfg.device)
     batch = gen_random_batch(batch_size, test_cfg)

From ccf135cac25101ac548341d26a7bd3c743c482d5 Mon Sep 17 00:00:00 2001
From: Daniel King <daniel@mosaicml.com>
Date: Tue, 24 Sep 2024 21:22:00 -0700
Subject: [PATCH 6/7] bump deprecation removal version

---
 llmfoundry/models/hf/model_wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llmfoundry/models/hf/model_wrapper.py b/llmfoundry/models/hf/model_wrapper.py
index c8805e5d6d..f2b67db1ec 100644
--- a/llmfoundry/models/hf/model_wrapper.py
+++ b/llmfoundry/models/hf/model_wrapper.py
@@ -48,7 +48,7 @@ def __init__(
         warnings.warn(
             VersionedDeprecationWarning(
                 '`HuggingFaceModelWithFSDP` is deprecated. In the future please use `BaseHuggingFaceModel`.',
-                remove_version='0.13.0',
+                remove_version='0.14.0',
             ),
         )
         super().__init__(

From fef1726085781dd7ff1438ea961398f59e2c2dbd Mon Sep 17 00:00:00 2001
From: Daniel King <daniel@mosaicml.com>
Date: Tue, 24 Sep 2024 21:23:09 -0700
Subject: [PATCH 7/7] bump fsdp_config deprecation removal version

---
 llmfoundry/command_utils/eval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py
index 70c4319ea8..73127e8a07 100644
--- a/llmfoundry/command_utils/eval.py
+++ b/llmfoundry/command_utils/eval.py
@@ -82,7 +82,7 @@ def evaluate_model(
         warnings.warn(
             VersionedDeprecationWarning(
                 'The argument fsdp_config is deprecated. Please use parallelism_config instead.',
-                remove_version='0.13.0',
+                remove_version='0.14.0',
             ),
         )
     if fsdp_config and parallelism_config: