From 81cdd0eaa0b5ccf7e69e8965ad2e2d873d2a83d9 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 18 Dec 2023 19:04:19 -0800 Subject: [PATCH 01/64] add peft config --- composer/models/huggingface.py | 13 ++++++++++++- setup.py | 4 ++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 8a944e29c2..060d40d33f 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -28,6 +28,8 @@ from transformers import PretrainedConfig from transformers.models.auto.auto_factory import _BaseAutoModelClass + from peft import PeftConfig, get_peft_model + log = logging.getLogger(__name__) __all__ = ['HuggingFaceModel'] @@ -73,7 +75,8 @@ def __init__(self, metrics: Optional[List[Metric]] = None, eval_metrics: Optional[List[Metric]] = None, shift_labels: Optional[bool] = None, - allow_embedding_resizing: bool = False) -> None: + allow_embedding_resizing: bool = False, + peft_config: Optional['PeftConfig'] = None) -> None: try: import transformers del transformers # unused @@ -87,6 +90,10 @@ def __init__(self, self.config = model.config self.model_forward_args = inspect.getfullargspec(self.model.forward).args self.tokenizer = tokenizer + self.peft_config = peft_config + + if self.peft_config.peft_type != 'lora': + raise ValueError(f'PEFT type {self.peft_config.peft_type} is not supported by HuggingFaceModel. Only lora is supported.') if self.tokenizer is None: log.warning( @@ -142,6 +149,10 @@ def __init__(self, self.dummy_forward_called = False + if self.peft_config is not None: + self.model = get_peft_model(self.model, self.peft_config) + log.info(f'PEFT model created. {self.model}') + @staticmethod def load_huggingface_tokenizer_from_saved_state( hf_state: Dict[str, Any], diff --git a/setup.py b/setup.py index 3a4bd6bd23..6ccc3bffc7 100644 --- a/setup.py +++ b/setup.py @@ -188,6 +188,10 @@ def package_files(prefix: str, directory: str, extension: str): 'datasets>=2.4,<3', ] +extra_deps['peft'] = [ + 'peft>=0.6.0,<0.7', +] + extra_deps['sentencepiece'] = [ 'protobuf<3.21', 'sentencepiece==0.1.99', From 3d06ed96d75699e7674b0bba4bc2a742b660b808 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 22 Dec 2023 00:34:14 -0800 Subject: [PATCH 02/64] fix --- composer/models/huggingface.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 060d40d33f..fd674e8060 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -23,12 +23,14 @@ from composer.models.base import ComposerModel from composer.utils import MissingConditionalImportError, dist, get_file, import_object, is_model_fsdp, safe_torch_load +from peft import get_peft_model + if TYPE_CHECKING: import transformers from transformers import PretrainedConfig from transformers.models.auto.auto_factory import _BaseAutoModelClass - from peft import PeftConfig, get_peft_model + from peft import PeftConfig log = logging.getLogger(__name__) From 27cee1d0428b01bb3755e85d89210d440c788f8e Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 22 Dec 2023 11:47:38 -0800 Subject: [PATCH 03/64] fix --- composer/models/huggingface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index fd674e8060..24dca688a2 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -94,8 +94,8 @@ def __init__(self, self.tokenizer = tokenizer self.peft_config = 
peft_config - if self.peft_config.peft_type != 'lora': - raise ValueError(f'PEFT type {self.peft_config.peft_type} is not supported by HuggingFaceModel. Only lora is supported.') + if self.peft_config.peft_type != 'LORA': + raise ValueError(f'PEFT type {self.peft_config.peft_type} is not supported by HuggingFaceModel. Only LORA is supported.') if self.tokenizer is None: log.warning( From 9fe230fe342179f7f01a0168d67b6110b6524d6b Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 7 Jan 2024 17:23:31 -0800 Subject: [PATCH 04/64] wip --- composer/models/huggingface.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 24dca688a2..9ea144c41c 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -13,6 +13,7 @@ import string import tempfile import textwrap +import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Type, Union @@ -94,8 +95,17 @@ def __init__(self, self.tokenizer = tokenizer self.peft_config = peft_config - if self.peft_config.peft_type != 'LORA': - raise ValueError(f'PEFT type {self.peft_config.peft_type} is not supported by HuggingFaceModel. Only LORA is supported.') + if self.peft_config is not None: + try: + import peft + del peft + except ImportError as e: + raise MissingConditionalImportError(extra_deps_group='peft', + conda_package='peft', + conda_channel='conda-forge') from e + + if self.peft_config is not None and self.peft_config.peft_type != 'LORA': + warnings.warn(f'PEFT type {self.peft_config.peft_type} is not supported by HuggingFaceModel. Only LORA is supported.', RuntimeWarning) if self.tokenizer is None: log.warning( From 91a9962eb32d916a27a0e3b47d90c95d69ae7eb3 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 8 Jan 2024 22:03:44 -0800 Subject: [PATCH 05/64] tests and fixes --- composer/models/huggingface.py | 73 +++++++++++++- tests/models/test_hf_model.py | 167 ++++++++++++++++++++++++++++++++- 2 files changed, 234 insertions(+), 6 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 9ea144c41c..6b4de0a252 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -24,7 +24,11 @@ from composer.models.base import ComposerModel from composer.utils import MissingConditionalImportError, dist, get_file, import_object, is_model_fsdp, safe_torch_load -from peft import get_peft_model +try: + from peft import get_peft_model, PeftModel + _peft_installed = True +except: + _peft_installed = False if TYPE_CHECKING: import transformers @@ -105,7 +109,7 @@ def __init__(self, conda_channel='conda-forge') from e if self.peft_config is not None and self.peft_config.peft_type != 'LORA': - warnings.warn(f'PEFT type {self.peft_config.peft_type} is not supported by HuggingFaceModel. Only LORA is supported.', RuntimeWarning) + raise ValueError(f'PEFT type {self.peft_config.peft_type} is not supported by HuggingFaceModel. 
Only LORA is supported.') if self.tokenizer is None: log.warning( @@ -521,6 +525,19 @@ def get_metadata(self): 'class': f'{self.model.__class__.__module__}.{self.model.__class__.__name__}' } + # Also save PEFT config the model is a peft model + if _peft_installed: + active_adapter = self.model.active_adapter + self.model.peft_config[active_adapter].save_pretrained(model_dir) + with open(model_dir / 'adapter_config.json') as _peft_config: + peft_config = json.load(_peft_config) + + model_output['peft_config'] = { + 'file_extension': '.json', + 'content': peft_config, + } + + if self.tokenizer is not None: for tokenizer_file_name in tokenizer_dir.iterdir(): tokenizer_file_path = tokenizer_dir / tokenizer_file_name @@ -659,6 +676,28 @@ def get_hf_config_from_composer_state_dict(state_dict: Dict[str, Any], f'Please make sure that the model_type={hf_config_dict.get("model_type")} is valid, or that the' f'config has a valid `_name_or_path`.') +def get_peft_config_from_composer_state_dict(state_dict: Dict[str, Any]) -> Optional['PeftConfig']: + """Get a PEFT config from a composer state dict + + Args: + state_dict (Dict[str, Any]): The state dict to get the config from + + Returns: + peft.PeftConfig: The PEFT config + """ + try: + import peft + except ImportError as e: + raise MissingConditionalImportError(extra_deps_group='nlp', + conda_package='peft', + conda_channel='conda-forge') from e + + if 'peft_config' not in state_dict['state']['integrations']['huggingface']['model']: + return None + + peft_config_dict = state_dict['state']['integrations']['huggingface']['model']['peft_config']['content'] + + return peft.get_peft_config(peft_config_dict) def write_huggingface_pretrained_from_composer_checkpoint( checkpoint_path: Union[Path, str], @@ -736,6 +775,34 @@ def write_huggingface_pretrained_from_composer_checkpoint( config = get_hf_config_from_composer_state_dict(composer_state_dict) config.save_pretrained(output_folder) + peft_config = get_peft_config_from_composer_state_dict(composer_state_dict) + if peft_config is not None: + peft_config.save_pretrained(output_folder) + weights_state_dict = composer_state_dict['state']['model'] torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(weights_state_dict, prefix='model.') - torch.save(weights_state_dict, Path(output_folder) / 'pytorch_model.bin') + + # NOTE: This only works for default adapter name + if peft_config is not None: + # Filtering copied from https://github.com/huggingface/peft/blob/4186c9b104644fd247a4cc0dc2dfc1ede4665204/src/peft/utils/save_and_load.py#L68C1-L86C116 + bias = peft_config.bias + if bias == "none": + to_return = {k: weights_state_dict[k] for k in weights_state_dict if "lora_" in k} + elif bias == "all": + to_return = {k: weights_state_dict[k] for k in weights_state_dict if "lora_" in k or "bias" in k} + elif bias == "lora_only": + to_return = {} + for k in weights_state_dict: + if "lora_" in k: + to_return[k] = weights_state_dict[k] + bias_name = k.split("lora_")[0] + "bias" + if bias_name in weights_state_dict: + to_return[bias_name] = weights_state_dict[bias_name] + else: + raise NotImplementedError + to_return = {k: v for k, v in to_return.items() if (("lora_" in k and 'default' in k) or ("bias" in k))} + to_return = {k.replace(f".default", ""): v for k, v in to_return.items()} + + torch.save(to_return, Path(output_folder) / 'adapter_model.bin') + else: + torch.save(weights_state_dict, Path(output_folder) / 'pytorch_model.bin') diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 
0f6076116f..266334d6eb 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -6,7 +6,7 @@ import tempfile from contextlib import nullcontext from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, TYPE_CHECKING from unittest.mock import patch from urllib.parse import urlparse @@ -29,6 +29,23 @@ configure_tiny_gpt2_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer) from tests.loggers.test_remote_uploader_downloader import DummyObjectStore +if TYPE_CHECKING: + from peft import PeftConfig + +@pytest.fixture +def tiny_gpt2_peft_config(): + pytest.importorskip("peft") + from peft import get_peft_config + + peft_config = get_peft_config( + { + 'peft_type': 'LORA', + 'task_type': 'CAUSAL_LM', + 'target_modules': ['c_attn'], + 'fan_in_fan_out': True, + } + ) + return peft_config def test_hf_tokenizer_save(tmp_path: Path, tiny_bert_model, tiny_bert_tokenizer): transformers = pytest.importorskip('transformers') @@ -431,14 +448,21 @@ def get_lm_trainer(hf_model, device_train_microbatch_size: Optional[int] = None, batch_size: int = 4, sequence_length: int = 4, - size: int = 4): + size: int = 4, + peft_config: Optional['PeftConfig'] = None): transformers = pytest.importorskip('transformers') metrics: List[Metric] = [LanguageCrossEntropy(ignore_index=-100)] if not is_conditional_generation: metrics.append(MaskedAccuracy(ignore_index=-100)) - model = HuggingFaceModel(hf_model, tokenizer=hf_tokenizer, metrics=metrics, use_logits=True) + model = HuggingFaceModel( + hf_model, + tokenizer=hf_tokenizer, + metrics=metrics, + use_logits=True, + peft_config=peft_config, + ) vocab_size = hf_model.config.vocab_size sequence_length = 4 @@ -1148,3 +1172,140 @@ def test_eval_forward_generate(device, world_size, hf_model, hf_tokenizer, use_f assert len(generation1) == len(generation2) == 2 assert all(isinstance(decoded_generation, str) for decoded_generation in generation1) assert all(isinstance(decoded_generation, str) for decoded_generation in generation2) + + +def test_peft_init(tiny_gpt2_model, tiny_gpt2_peft_config): + pytest.importorskip("peft") + from peft import PeftModelForCausalLM + + original_model = copy.deepcopy(tiny_gpt2_model) + hf_model = HuggingFaceModel(tiny_gpt2_model, peft_config=tiny_gpt2_peft_config) + assert isinstance(hf_model.model, PeftModelForCausalLM) + assert hf_model.model.peft_config['default'].peft_type == 'LORA' + assert hf_model.model.peft_config['default'].task_type == 'CAUSAL_LM' + assert hf_model.model.config == original_model.config + +def test_peft_init_not_installed(tiny_gpt2_model, tiny_gpt2_peft_config): + pytest.importorskip("peft") + + with patch.dict('sys.modules', {'peft': None}): + with pytest.raises(ImportError): + from composer.models import HuggingFaceModel + _ = HuggingFaceModel(tiny_gpt2_model, peft_config=tiny_gpt2_peft_config) + +def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config, tmp_path): + pytest.importorskip("peft") + + trainer = get_lm_trainer( + tiny_gpt2_model, + tiny_gpt2_tokenizer, + str(tmp_path), + peft_config=tiny_gpt2_peft_config, + device_train_microbatch_size=1, + mlm=False, + ) + trainer.fit() + + load_trainer = get_lm_trainer( + tiny_gpt2_model, + tiny_gpt2_tokenizer, + str(tmp_path), + peft_config=tiny_gpt2_peft_config, + device_train_microbatch_size=1, + mlm=False, + load_path=str(tmp_path / 'hf-checkpoint.pt'), + ) + + for p1, p2 in zip(trainer.state.model.parameters(), 
load_trainer.state.model.parameters()): + torch.testing.assert_close(p1, p2) + +def test_peft_generate(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config): + pytest.importorskip("peft") + + hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=tiny_gpt2_peft_config) + + input_dict = tiny_gpt2_tokenizer(['hello', 'goodbyes'], return_tensors='pt', padding=True) + hf_model.generate(**input_dict, max_new_tokens=5, pad_token_id=tiny_gpt2_tokenizer.pad_token_id) + +def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config): + pytest.importorskip("peft") + + from peft import get_peft_config + + hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=tiny_gpt2_peft_config) + metadata = hf_model.get_metadata() + loaded_peft_config = get_peft_config(metadata['model']['peft_config']['content']) + + assert loaded_peft_config == tiny_gpt2_peft_config + +def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config, tmp_path): + peft = pytest.importorskip("peft") + transformers = pytest.importorskip('transformers') + + # Simulate a local model instead of a hub model + tiny_gpt2_model.save_pretrained(tmp_path / 'hf-save-to-load') + tiny_gpt2_model = transformers.AutoModelForCausalLM.from_pretrained(tmp_path / 'hf-save-to-load') + + trainer = get_lm_trainer( + tiny_gpt2_model, + tiny_gpt2_tokenizer, + str(tmp_path), + peft_config=tiny_gpt2_peft_config, + device_train_microbatch_size=1, + mlm=False, + ) + trainer.fit() + + from composer.models.huggingface import write_huggingface_pretrained_from_composer_checkpoint + write_huggingface_pretrained_from_composer_checkpoint(str(tmp_path / 'hf-checkpoint.pt'), tmp_path / 'hf-save-pretrained') + + # Test we can load back in using transformers interface + loaded_hf_model = transformers.AutoModelForCausalLM.from_pretrained(str(tmp_path / 'hf-save-pretrained')) + for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_hf_model.parameters()): + torch.testing.assert_close(p1, p2) + + # Test we can load back in using peft interface + loaded_peft_model = peft.PeftModelForCausalLM.from_pretrained(tiny_gpt2_model, str(tmp_path / 'hf-save-pretrained')) + for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_peft_model.parameters()): + torch.testing.assert_close(p1, p2) + +@pytest.mark.gpu +@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'), + reason='requires PyTorch 1.13 or higher') +def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config, tmp_path): + pytest.importorskip("peft") + + fsdp_config = { + 'sharding_strategy': 'FULL_SHARD', + 'cpu_offload': False, + 'mixed_precision': 'PURE', + 'backward_prefetch': 'BACKWARD_PRE', + 'activation_checkpointing': False, + 'activation_cpu_offload': False, + 'verbose': False + } + + trainer = get_lm_trainer( + tiny_gpt2_model, + tiny_gpt2_tokenizer, + str(tmp_path), + peft_config=tiny_gpt2_peft_config, + device_train_microbatch_size=1, + mlm=False, + fsdp_config=fsdp_config, + ) + trainer.fit() + + load_trainer = get_lm_trainer( + tiny_gpt2_model, + tiny_gpt2_tokenizer, + str(tmp_path), + peft_config=tiny_gpt2_peft_config, + device_train_microbatch_size=1, + mlm=False, + load_path=str(tmp_path / 'hf-checkpoint.pt'), + fsdp_config=fsdp_config, + ) + + for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): + torch.testing.assert_close(p1, p2) \ No newline at end of file From 
af8a8879965aa5e7a23c567a9ba0da6f10ac8dc9 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 8 Jan 2024 22:06:44 -0800 Subject: [PATCH 06/64] fix --- composer/models/huggingface.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 6b4de0a252..064ebce77a 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -97,9 +97,8 @@ def __init__(self, self.config = model.config self.model_forward_args = inspect.getfullargspec(self.model.forward).args self.tokenizer = tokenizer - self.peft_config = peft_config - if self.peft_config is not None: + if peft_config is not None: try: import peft del peft @@ -165,8 +164,9 @@ def __init__(self, self.dummy_forward_called = False - if self.peft_config is not None: - self.model = get_peft_model(self.model, self.peft_config) + if peft_config is not None: + from peft import get_peft_model + self.model = get_peft_model(self.model, peft_config) log.info(f'PEFT model created. {self.model}') @staticmethod From 9a51ded941978b286b4802c577f29e090632d39a Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 8 Jan 2024 22:07:54 -0800 Subject: [PATCH 07/64] fix --- composer/models/huggingface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 064ebce77a..85be451b2a 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -107,8 +107,8 @@ def __init__(self, conda_package='peft', conda_channel='conda-forge') from e - if self.peft_config is not None and self.peft_config.peft_type != 'LORA': - raise ValueError(f'PEFT type {self.peft_config.peft_type} is not supported by HuggingFaceModel. Only LORA is supported.') + if peft_config is not None and peft_config.peft_type != 'LORA': + raise ValueError(f'PEFT type {peft_config.peft_type} is not supported by HuggingFaceModel. 
Only LORA is supported.') if self.tokenizer is None: log.warning( From d43bd0c332fccc68dd32d92371e9878c505315a0 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 00:44:33 -0800 Subject: [PATCH 08/64] precommit --- composer/models/huggingface.py | 85 +++++++++++++++++----------------- tests/models/test_hf_model.py | 59 ++++++++++++----------- 2 files changed, 76 insertions(+), 68 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 71a0879dda..5428d0104a 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -13,7 +13,6 @@ import string import tempfile import textwrap -import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Type, Union @@ -25,18 +24,17 @@ from composer.utils import MissingConditionalImportError, dist, get_file, import_object, is_model_fsdp, safe_torch_load try: - from peft import get_peft_model, PeftModel + from peft import get_peft_model _peft_installed = True except: _peft_installed = False if TYPE_CHECKING: import transformers + from peft import PeftConfig, PeftModel from transformers import PretrainedConfig from transformers.models.auto.auto_factory import _BaseAutoModelClass - from peft import PeftConfig - log = logging.getLogger(__name__) __all__ = ['HuggingFaceModel'] @@ -75,7 +73,7 @@ class HuggingFaceModel(ComposerModel): """ def __init__(self, - model: transformers.PreTrainedModel, + model: Union[transformers.PreTrainedModel, 'PeftModel'], tokenizer: Optional[Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast]] = None, use_logits: Optional[bool] = False, @@ -94,21 +92,19 @@ def __init__(self, super().__init__() self.model = model - self.config = model.config + self.config: PretrainedConfig = model.config self.model_forward_args = inspect.getfullargspec(self.model.forward).args self.tokenizer = tokenizer if peft_config is not None: - try: - import peft - del peft - except ImportError as e: + if not _peft_installed: raise MissingConditionalImportError(extra_deps_group='peft', conda_package='peft', - conda_channel='conda-forge') from e + conda_channel='conda-forge') if peft_config is not None and peft_config.peft_type != 'LORA': - raise ValueError(f'PEFT type {peft_config.peft_type} is not supported by HuggingFaceModel. Only LORA is supported.') + raise ValueError( + f'PEFT type {peft_config.peft_type} is not supported by HuggingFaceModel. Only LORA is supported.') if self.tokenizer is None: log.warning( @@ -165,7 +161,6 @@ def __init__(self, self.dummy_forward_called = False if peft_config is not None: - from peft import get_peft_model self.model = get_peft_model(self.model, peft_config) log.info(f'PEFT model created. 
{self.model}') @@ -456,7 +451,8 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # HF encoder decoder models like T5 expect either decoder_input_ids or labels, # so we add decoder_input_ids to the batch if it is missing - if self.model.config.is_encoder_decoder and 'decoder_input_ids' not in batch: + model_config: PretrainedConfig = self.model.config + if model_config.is_encoder_decoder and 'decoder_input_ids' not in batch: if hasattr(self.model, 'prepare_decoder_input_ids_from_labels'): batch['decoder_input_ids'] = self.model.prepare_decoder_input_ids_from_labels(labels=self.labels) else: @@ -512,7 +508,9 @@ def get_metadata(self): tmp_dir = Path(tmp_dir) model_dir = tmp_dir / 'model' tokenizer_dir = tmp_dir / 'tokenizer' - self.model.config.save_pretrained(model_dir) + + original_model_config: PretrainedConfig = self.model.config + original_model_config.save_pretrained(model_dir) if self.tokenizer is not None: self.tokenizer.save_pretrained(tokenizer_dir) @@ -525,18 +523,19 @@ def get_metadata(self): 'class': f'{self.model.__class__.__module__}.{self.model.__class__.__name__}' } - # Also save PEFT config the model is a peft model + # Also save PEFT config if the model is a peft model if _peft_installed: - active_adapter = self.model.active_adapter - self.model.peft_config[active_adapter].save_pretrained(model_dir) - with open(model_dir / 'adapter_config.json') as _peft_config: - peft_config = json.load(_peft_config) - - model_output['peft_config'] = { - 'file_extension': '.json', - 'content': peft_config, - } - + from peft import PeftModel + if isinstance(self.model, PeftModel): + active_adapter = self.model.active_adapter + self.model.peft_config[active_adapter].save_pretrained(str(model_dir)) + with open(model_dir / 'adapter_config.json') as _peft_config: + peft_config = json.load(_peft_config) + + model_output['peft_config'] = { + 'file_extension': '.json', + 'content': peft_config, + } if self.tokenizer is not None: for tokenizer_file_name in tokenizer_dir.iterdir(): @@ -592,7 +591,8 @@ def generate(self, input_ids: torch.Tensor, **kwargs): if not using_torch_2() and not self.dummy_forward_called and is_model_fsdp(self.model): with torch.no_grad(): maybe_decoder_input_ids = {} - if self.model.config.is_encoder_decoder: + model_config: PretrainedConfig = self.model.config + if model_config.is_encoder_decoder: maybe_decoder_input_ids['decoder_input_ids'] = torch.tensor([[0]], dtype=torch.long, device=input_ids.device) @@ -676,6 +676,7 @@ def get_hf_config_from_composer_state_dict(state_dict: Dict[str, Any], f'Please make sure that the model_type={hf_config_dict.get("model_type")} is valid, or that the' f'config has a valid `_name_or_path`.') + def get_peft_config_from_composer_state_dict(state_dict: Dict[str, Any]) -> Optional['PeftConfig']: """Get a PEFT config from a composer state dict @@ -684,12 +685,11 @@ def get_peft_config_from_composer_state_dict(state_dict: Dict[str, Any]) -> Opti Returns: peft.PeftConfig: The PEFT config - """ + """ try: import peft except ImportError as e: - raise MissingConditionalImportError(extra_deps_group='nlp', - conda_package='peft', + raise MissingConditionalImportError(extra_deps_group='nlp', conda_package='peft', conda_channel='conda-forge') from e if 'peft_config' not in state_dict['state']['integrations']['huggingface']['model']: @@ -699,6 +699,7 @@ def get_peft_config_from_composer_state_dict(state_dict: Dict[str, Any]) -> Opti return peft.get_peft_config(peft_config_dict) + def 
write_huggingface_pretrained_from_composer_checkpoint( checkpoint_path: Union[Path, str], output_folder: Union[Path, str], @@ -777,32 +778,32 @@ def write_huggingface_pretrained_from_composer_checkpoint( peft_config = get_peft_config_from_composer_state_dict(composer_state_dict) if peft_config is not None: - peft_config.save_pretrained(output_folder) + peft_config.save_pretrained(str(output_folder)) weights_state_dict = composer_state_dict['state']['model'] torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(weights_state_dict, prefix='model.') - + # NOTE: This only works for default adapter name if peft_config is not None: # Filtering copied from https://github.com/huggingface/peft/blob/4186c9b104644fd247a4cc0dc2dfc1ede4665204/src/peft/utils/save_and_load.py#L68C1-L86C116 bias = peft_config.bias - if bias == "none": - to_return = {k: weights_state_dict[k] for k in weights_state_dict if "lora_" in k} - elif bias == "all": - to_return = {k: weights_state_dict[k] for k in weights_state_dict if "lora_" in k or "bias" in k} - elif bias == "lora_only": + if bias == 'none': + to_return = {k: weights_state_dict[k] for k in weights_state_dict if 'lora_' in k} + elif bias == 'all': + to_return = {k: weights_state_dict[k] for k in weights_state_dict if 'lora_' in k or 'bias' in k} + elif bias == 'lora_only': to_return = {} for k in weights_state_dict: - if "lora_" in k: + if 'lora_' in k: to_return[k] = weights_state_dict[k] - bias_name = k.split("lora_")[0] + "bias" + bias_name = k.split('lora_')[0] + 'bias' if bias_name in weights_state_dict: to_return[bias_name] = weights_state_dict[bias_name] else: raise NotImplementedError - to_return = {k: v for k, v in to_return.items() if (("lora_" in k and 'default' in k) or ("bias" in k))} - to_return = {k.replace(f".default", ""): v for k, v in to_return.items()} - + to_return = {k: v for k, v in to_return.items() if (('lora_' in k and 'default' in k) or ('bias' in k))} + to_return = {k.replace(f'.default', ''): v for k, v in to_return.items()} + torch.save(to_return, Path(output_folder) / 'adapter_model.bin') else: torch.save(weights_state_dict, Path(output_folder) / 'pytorch_model.bin') diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 266334d6eb..10e0647ae8 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -6,7 +6,7 @@ import tempfile from contextlib import nullcontext from pathlib import Path -from typing import Any, Dict, List, Optional, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Dict, List, Optional from unittest.mock import patch from urllib.parse import urlparse @@ -32,21 +32,21 @@ if TYPE_CHECKING: from peft import PeftConfig + @pytest.fixture def tiny_gpt2_peft_config(): - pytest.importorskip("peft") + pytest.importorskip('peft') from peft import get_peft_config - - peft_config = get_peft_config( - { - 'peft_type': 'LORA', - 'task_type': 'CAUSAL_LM', - 'target_modules': ['c_attn'], - 'fan_in_fan_out': True, - } - ) + + peft_config = get_peft_config({ + 'peft_type': 'LORA', + 'task_type': 'CAUSAL_LM', + 'target_modules': ['c_attn'], + 'fan_in_fan_out': True, + }) return peft_config + def test_hf_tokenizer_save(tmp_path: Path, tiny_bert_model, tiny_bert_tokenizer): transformers = pytest.importorskip('transformers') @@ -1175,7 +1175,7 @@ def test_eval_forward_generate(device, world_size, hf_model, hf_tokenizer, use_f def test_peft_init(tiny_gpt2_model, tiny_gpt2_peft_config): - pytest.importorskip("peft") + pytest.importorskip('peft') from peft import 
PeftModelForCausalLM original_model = copy.deepcopy(tiny_gpt2_model) @@ -1185,17 +1185,19 @@ def test_peft_init(tiny_gpt2_model, tiny_gpt2_peft_config): assert hf_model.model.peft_config['default'].task_type == 'CAUSAL_LM' assert hf_model.model.config == original_model.config + def test_peft_init_not_installed(tiny_gpt2_model, tiny_gpt2_peft_config): - pytest.importorskip("peft") + pytest.importorskip('peft') with patch.dict('sys.modules', {'peft': None}): with pytest.raises(ImportError): from composer.models import HuggingFaceModel _ = HuggingFaceModel(tiny_gpt2_model, peft_config=tiny_gpt2_peft_config) + def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config, tmp_path): - pytest.importorskip("peft") - + pytest.importorskip('peft') + trainer = get_lm_trainer( tiny_gpt2_model, tiny_gpt2_tokenizer, @@ -1215,20 +1217,22 @@ def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_p mlm=False, load_path=str(tmp_path / 'hf-checkpoint.pt'), ) - + for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): torch.testing.assert_close(p1, p2) + def test_peft_generate(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config): - pytest.importorskip("peft") + pytest.importorskip('peft') hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=tiny_gpt2_peft_config) input_dict = tiny_gpt2_tokenizer(['hello', 'goodbyes'], return_tensors='pt', padding=True) hf_model.generate(**input_dict, max_new_tokens=5, pad_token_id=tiny_gpt2_tokenizer.pad_token_id) + def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config): - pytest.importorskip("peft") + pytest.importorskip('peft') from peft import get_peft_config @@ -1238,14 +1242,15 @@ def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_conf assert loaded_peft_config == tiny_gpt2_peft_config + def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config, tmp_path): - peft = pytest.importorskip("peft") + peft = pytest.importorskip('peft') transformers = pytest.importorskip('transformers') # Simulate a local model instead of a hub model tiny_gpt2_model.save_pretrained(tmp_path / 'hf-save-to-load') tiny_gpt2_model = transformers.AutoModelForCausalLM.from_pretrained(tmp_path / 'hf-save-to-load') - + trainer = get_lm_trainer( tiny_gpt2_model, tiny_gpt2_tokenizer, @@ -1257,23 +1262,25 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_ trainer.fit() from composer.models.huggingface import write_huggingface_pretrained_from_composer_checkpoint - write_huggingface_pretrained_from_composer_checkpoint(str(tmp_path / 'hf-checkpoint.pt'), tmp_path / 'hf-save-pretrained') + write_huggingface_pretrained_from_composer_checkpoint(str(tmp_path / 'hf-checkpoint.pt'), + tmp_path / 'hf-save-pretrained') # Test we can load back in using transformers interface loaded_hf_model = transformers.AutoModelForCausalLM.from_pretrained(str(tmp_path / 'hf-save-pretrained')) for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_hf_model.parameters()): torch.testing.assert_close(p1, p2) - + # Test we can load back in using peft interface loaded_peft_model = peft.PeftModelForCausalLM.from_pretrained(tiny_gpt2_model, str(tmp_path / 'hf-save-pretrained')) for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_peft_model.parameters()): torch.testing.assert_close(p1, p2) + @pytest.mark.gpu @pytest.mark.skipif(version.parse(torch.__version__) < 
version.parse('1.13.0'), reason='requires PyTorch 1.13 or higher') def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config, tmp_path): - pytest.importorskip("peft") + pytest.importorskip('peft') fsdp_config = { 'sharding_strategy': 'FULL_SHARD', @@ -1284,7 +1291,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_c 'activation_cpu_offload': False, 'verbose': False } - + trainer = get_lm_trainer( tiny_gpt2_model, tiny_gpt2_tokenizer, @@ -1306,6 +1313,6 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_c load_path=str(tmp_path / 'hf-checkpoint.pt'), fsdp_config=fsdp_config, ) - + for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): - torch.testing.assert_close(p1, p2) \ No newline at end of file + torch.testing.assert_close(p1, p2) From a8cc52ff0a072d2bb748d6d41b869f7767b3acd8 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 00:53:43 -0800 Subject: [PATCH 09/64] fsdp test --- tests/models/test_hf_model.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 10e0647ae8..86eaeb2d3c 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1277,9 +1277,10 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_ @pytest.mark.gpu +@world_size(2) @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'), reason='requires PyTorch 1.13 or higher') -def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config, tmp_path): +def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config, tmp_path, world_size): pytest.importorskip('peft') fsdp_config = { @@ -1314,5 +1315,8 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_c fsdp_config=fsdp_config, ) - for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): - torch.testing.assert_close(p1, p2) + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + + with FSDP.summon_full_params(trainer.state.model), FSDP.summon_full_params(load_trainer.state.model): + for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): + torch.testing.assert_close(p1, p2) From a7fe00c638cab50ff706d193474b3bc010d80cd6 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 01:34:18 -0800 Subject: [PATCH 10/64] bump version, add mistral fixtures, fix various tests --- setup.py | 2 +- tests/common/models.py | 21 ++++++++++ tests/conftest.py | 6 ++- tests/fixtures/fixtures.py | 45 +++++++++++++++++++++ tests/models/test_hf_model.py | 76 +++++++++++++++++++++++------------ 5 files changed, 123 insertions(+), 27 deletions(-) diff --git a/setup.py b/setup.py index 794af774dd..c4a45e19d6 100644 --- a/setup.py +++ b/setup.py @@ -190,7 +190,7 @@ def package_files(prefix: str, directory: str, extension: str): ] extra_deps['peft'] = [ - 'peft>=0.6.0,<0.7', + 'peft>=0.7.0,<0.8', ] extra_deps['sentencepiece'] = [ diff --git a/tests/common/models.py b/tests/common/models.py index cac3769b38..2a91b671df 100644 --- a/tests/common/models.py +++ b/tests/common/models.py @@ -541,3 +541,24 @@ def configure_tiny_t5_config(): def configure_tiny_t5_hf_model(use_logits=True): return HuggingFaceModel(configure_tiny_t5_model(), configure_tiny_t5_tokenizer(), use_logits) + +def configure_tiny_mistral_model(): + try: + return 
copy.deepcopy(pytest.tiny_mistral_model) + except AttributeError: + pytest.skip('Composer installed without NLP support') + +def configure_tiny_mistral_tokenizer(): + try: + return copy.deepcopy(pytest.tiny_mistral_tokenizer) + except AttributeError: + pytest.skip('Composer installed without NLP support') + +def configure_tiny_mistral_config(): + try: + return copy.deepcopy(pytest.tiny_mistral_config) + except AttributeError: + pytest.skip('Composer installed without NLP support') + +def configure_tiny_mistral_hf_model(use_logits=True): + return HuggingFaceModel(configure_tiny_mistral_model(), configure_tiny_mistral_tokenizer(), use_logits) \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index bcd063d9c7..c56c099d0a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -113,7 +113,8 @@ def pytest_configure(): tiny_bert_tokenizer_helper, tiny_gpt2_config_helper, tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper, tiny_opt_config_helper, tiny_opt_model_helper, tiny_opt_tokenizer_helper, tiny_t5_config_helper, - tiny_t5_model_helper, tiny_t5_tokenizer_helper) + tiny_t5_model_helper, tiny_t5_tokenizer_helper, tiny_mistral_config_helper, + tiny_mistral_model_helper, tiny_mistral_tokenizer_helper) pytest.tiny_bert_config = tiny_bert_config_helper() # type: ignore pytest.tiny_bert_model = tiny_bert_model_helper(pytest.tiny_bert_config) # type: ignore pytest.tiny_bert_tokenizer = tiny_bert_tokenizer_helper() # type: ignore @@ -126,6 +127,9 @@ def pytest_configure(): pytest.tiny_t5_config = tiny_t5_config_helper() # type: ignore pytest.tiny_t5_model = tiny_t5_model_helper(pytest.tiny_t5_config) # type: ignore pytest.tiny_t5_tokenizer = tiny_t5_tokenizer_helper() # type: ignore + pytest.tiny_mistral_config = tiny_mistral_config_helper() + pytest.tiny_mistral_model = tiny_mistral_model_helper(pytest.tiny_mistral_config) + pytest.tiny_mistral_tokenizer = tiny_mistral_tokenizer_helper() def pytest_sessionfinish(session: pytest.Session, exitstatus: int): diff --git a/tests/fixtures/fixtures.py b/tests/fixtures/fixtures.py index cfd8674338..3067c9cfc9 100644 --- a/tests/fixtures/fixtures.py +++ b/tests/fixtures/fixtures.py @@ -320,6 +320,39 @@ def _session_tiny_t5_model(_session_tiny_t5_config): # type: ignore return tiny_t5_model_helper(_session_tiny_t5_config) +def tiny_mistral_config_helper(): + transformers = pytest.importorskip('transformers') + + tiny_overrides = {'hidden_size': 128, 'intermediate_size': 256, 'num_attention_heads': 8, 'num_hidden_layers': 2, 'num_kv_heads': 4} + return transformers.AutoConfig.from_pretrained('mistralai/Mistral-7B-v0.1', **tiny_overrides) + + +@pytest.fixture(scope='session') +def _session_tiny_mistral_config(): # type: ignore + return tiny_mistral_config_helper() + + +def tiny_mistral_tokenizer_helper(): + transformers = pytest.importorskip('transformers') + + hf_tokenizer = transformers.AutoTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1', model_max_length=512) + return hf_tokenizer + + +@pytest.fixture(scope='session') +def _session_tiny_mistral_tokenizer(): # type: ignore + return tiny_mistral_tokenizer_helper() + + +def tiny_mistral_model_helper(config): + transformers = pytest.importorskip('transformers') + + return transformers.AutoModelForCausalLM.from_config(config) + +@pytest.fixture(scope='session') +def _session_tiny_t5_model(_session_tiny_t5_config): # type: ignore + return tiny_t5_model_helper(_session_tiny_t5_config) + @pytest.fixture def tiny_bert_model(_session_tiny_bert_model): return 
copy.deepcopy(_session_tiny_bert_model) @@ -393,3 +426,15 @@ def tiny_t5_tokenizer(_session_tiny_t5_tokenizer): @pytest.fixture def tiny_t5_model(_session_tiny_t5_model): return copy.deepcopy(_session_tiny_t5_model) + +@pytest.fixture +def tiny_mistral_config(_session_tiny_mistral_config): + return copy.deepcopy(_session_tiny_mistral_config) + +@pytest.fixture +def tiny_mistral_tokenizer(_session_tiny_mistral_tokenizer): + return copy.deepcopy(_session_tiny_mistral_tokenizer) + +@pytest.fixture +def tiny_mistral_model(_session_tiny_mistral_model): + return copy.deepcopy(_session_tiny_mistral_model) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 86eaeb2d3c..cd0c6cd523 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -26,15 +26,14 @@ from tests.common.datasets import RandomTextClassificationDataset, RandomTextLMDataset, RandomTextRegressionDataset from tests.common.markers import device, world_size from tests.common.models import (configure_tiny_bert_model, configure_tiny_bert_tokenizer, configure_tiny_gpt2_model, - configure_tiny_gpt2_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer) + configure_tiny_gpt2_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer, + configure_tiny_mistral_model, configure_tiny_mistral_tokenizer,) from tests.loggers.test_remote_uploader_downloader import DummyObjectStore if TYPE_CHECKING: from peft import PeftConfig - -@pytest.fixture -def tiny_gpt2_peft_config(): +def _gpt2_peft_config(): pytest.importorskip('peft') from peft import get_peft_config @@ -46,6 +45,24 @@ def tiny_gpt2_peft_config(): }) return peft_config +@pytest.fixture +def gpt2_peft_config(): + return _gpt2_peft_config() + +def _mistral_peft_config(): + pytest.importorskip('peft') + from peft import get_peft_config + + peft_config = get_peft_config({ + 'peft_type': 'LORA', + 'task_type': 'CAUSAL_LM', + 'target_modules': ['up_proj'], + }) + return peft_config + +@pytest.fixture +def mistral_peft_config(): + return _mistral_peft_config() def test_hf_tokenizer_save(tmp_path: Path, tiny_bert_model, tiny_bert_tokenizer): transformers = pytest.importorskip('transformers') @@ -1174,35 +1191,35 @@ def test_eval_forward_generate(device, world_size, hf_model, hf_tokenizer, use_f assert all(isinstance(decoded_generation, str) for decoded_generation in generation2) -def test_peft_init(tiny_gpt2_model, tiny_gpt2_peft_config): +def test_peft_init(tiny_gpt2_model, gpt2_peft_config): pytest.importorskip('peft') from peft import PeftModelForCausalLM original_model = copy.deepcopy(tiny_gpt2_model) - hf_model = HuggingFaceModel(tiny_gpt2_model, peft_config=tiny_gpt2_peft_config) + hf_model = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) assert isinstance(hf_model.model, PeftModelForCausalLM) assert hf_model.model.peft_config['default'].peft_type == 'LORA' assert hf_model.model.peft_config['default'].task_type == 'CAUSAL_LM' assert hf_model.model.config == original_model.config -def test_peft_init_not_installed(tiny_gpt2_model, tiny_gpt2_peft_config): +def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): pytest.importorskip('peft') - with patch.dict('sys.modules', {'peft': None}): + with patch('composer.models.huggingface._peft_installed', False): with pytest.raises(ImportError): from composer.models import HuggingFaceModel - _ = HuggingFaceModel(tiny_gpt2_model, peft_config=tiny_gpt2_peft_config) + _ = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) -def 
test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config, tmp_path): +def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): pytest.importorskip('peft') trainer = get_lm_trainer( tiny_gpt2_model, tiny_gpt2_tokenizer, str(tmp_path), - peft_config=tiny_gpt2_peft_config, + peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, ) @@ -1212,7 +1229,7 @@ def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_p tiny_gpt2_model, tiny_gpt2_tokenizer, str(tmp_path), - peft_config=tiny_gpt2_peft_config, + peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, load_path=str(tmp_path / 'hf-checkpoint.pt'), @@ -1221,29 +1238,38 @@ def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_p for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): torch.testing.assert_close(p1, p2) - -def test_peft_generate(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config): +@pytest.mark.parametrize('model,tokenizer,peft_config', [ + (configure_tiny_gpt2_model, configure_tiny_gpt2_tokenizer, _gpt2_peft_config()), + (configure_tiny_mistral_model, configure_tiny_mistral_tokenizer, _mistral_peft_config()), +]) +def test_peft_generate(model, tokenizer, peft_config): pytest.importorskip('peft') - hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=tiny_gpt2_peft_config) + model = model() + tokenizer = tokenizer() + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + hf_model = HuggingFaceModel(model, tokenizer=tokenizer, peft_config=peft_config) - input_dict = tiny_gpt2_tokenizer(['hello', 'goodbyes'], return_tensors='pt', padding=True) - hf_model.generate(**input_dict, max_new_tokens=5, pad_token_id=tiny_gpt2_tokenizer.pad_token_id) + input_dict = tokenizer(['hello', 'goodbyes'], return_tensors='pt', padding=True) + hf_model.generate(**input_dict, max_new_tokens=5, pad_token_id=tokenizer.pad_token_id) -def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config): +def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config): pytest.importorskip('peft') from peft import get_peft_config - hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=tiny_gpt2_peft_config) + hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=gpt2_peft_config) metadata = hf_model.get_metadata() loaded_peft_config = get_peft_config(metadata['model']['peft_config']['content']) - assert loaded_peft_config == tiny_gpt2_peft_config + assert loaded_peft_config == gpt2_peft_config -def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config, tmp_path): +def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): peft = pytest.importorskip('peft') transformers = pytest.importorskip('transformers') @@ -1255,7 +1281,7 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_ tiny_gpt2_model, tiny_gpt2_tokenizer, str(tmp_path), - peft_config=tiny_gpt2_peft_config, + peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, ) @@ -1280,7 +1306,7 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_ @world_size(2) @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'), reason='requires PyTorch 1.13 or higher') -def 
test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_config, tmp_path, world_size): +def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size): pytest.importorskip('peft') fsdp_config = { @@ -1297,7 +1323,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_c tiny_gpt2_model, tiny_gpt2_tokenizer, str(tmp_path), - peft_config=tiny_gpt2_peft_config, + peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, fsdp_config=fsdp_config, @@ -1308,7 +1334,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, tiny_gpt2_peft_c tiny_gpt2_model, tiny_gpt2_tokenizer, str(tmp_path), - peft_config=tiny_gpt2_peft_config, + peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, load_path=str(tmp_path / 'hf-checkpoint.pt'), From 48808b0878b95b62fab7cce5175f06b5d54e8d84 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 01:35:36 -0800 Subject: [PATCH 11/64] precommit --- tests/common/models.py | 6 +++++- tests/conftest.py | 13 +++++++------ tests/fixtures/fixtures.py | 13 ++++++++++++- tests/models/test_hf_model.py | 10 ++++++++-- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/tests/common/models.py b/tests/common/models.py index 2a91b671df..2e310ddacb 100644 --- a/tests/common/models.py +++ b/tests/common/models.py @@ -542,23 +542,27 @@ def configure_tiny_t5_config(): def configure_tiny_t5_hf_model(use_logits=True): return HuggingFaceModel(configure_tiny_t5_model(), configure_tiny_t5_tokenizer(), use_logits) + def configure_tiny_mistral_model(): try: return copy.deepcopy(pytest.tiny_mistral_model) except AttributeError: pytest.skip('Composer installed without NLP support') + def configure_tiny_mistral_tokenizer(): try: return copy.deepcopy(pytest.tiny_mistral_tokenizer) except AttributeError: pytest.skip('Composer installed without NLP support') + def configure_tiny_mistral_config(): try: return copy.deepcopy(pytest.tiny_mistral_config) except AttributeError: pytest.skip('Composer installed without NLP support') + def configure_tiny_mistral_hf_model(use_logits=True): - return HuggingFaceModel(configure_tiny_mistral_model(), configure_tiny_mistral_tokenizer(), use_logits) \ No newline at end of file + return HuggingFaceModel(configure_tiny_mistral_model(), configure_tiny_mistral_tokenizer(), use_logits) diff --git a/tests/conftest.py b/tests/conftest.py index c56c099d0a..bb923e8870 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -111,10 +111,11 @@ def pytest_configure(): if TRANSFORMERS_INSTALLED: from tests.fixtures.fixtures import (tiny_bert_config_helper, tiny_bert_model_helper, tiny_bert_tokenizer_helper, tiny_gpt2_config_helper, - tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper, tiny_opt_config_helper, + tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper, + tiny_mistral_config_helper, tiny_mistral_model_helper, + tiny_mistral_tokenizer_helper, tiny_opt_config_helper, tiny_opt_model_helper, tiny_opt_tokenizer_helper, tiny_t5_config_helper, - tiny_t5_model_helper, tiny_t5_tokenizer_helper, tiny_mistral_config_helper, - tiny_mistral_model_helper, tiny_mistral_tokenizer_helper) + tiny_t5_model_helper, tiny_t5_tokenizer_helper) pytest.tiny_bert_config = tiny_bert_config_helper() # type: ignore pytest.tiny_bert_model = tiny_bert_model_helper(pytest.tiny_bert_config) # type: ignore pytest.tiny_bert_tokenizer = tiny_bert_tokenizer_helper() # type: ignore @@ -127,9 +128,9 @@ def pytest_configure(): pytest.tiny_t5_config = 
tiny_t5_config_helper() # type: ignore pytest.tiny_t5_model = tiny_t5_model_helper(pytest.tiny_t5_config) # type: ignore pytest.tiny_t5_tokenizer = tiny_t5_tokenizer_helper() # type: ignore - pytest.tiny_mistral_config = tiny_mistral_config_helper() - pytest.tiny_mistral_model = tiny_mistral_model_helper(pytest.tiny_mistral_config) - pytest.tiny_mistral_tokenizer = tiny_mistral_tokenizer_helper() + pytest.tiny_mistral_config = tiny_mistral_config_helper() # type: ignore + pytest.tiny_mistral_model = tiny_mistral_model_helper(pytest.tiny_mistral_config) # type: ignore + pytest.tiny_mistral_tokenizer = tiny_mistral_tokenizer_helper() # type: ignore def pytest_sessionfinish(session: pytest.Session, exitstatus: int): diff --git a/tests/fixtures/fixtures.py b/tests/fixtures/fixtures.py index 3067c9cfc9..17bc272b1e 100644 --- a/tests/fixtures/fixtures.py +++ b/tests/fixtures/fixtures.py @@ -323,7 +323,13 @@ def _session_tiny_t5_model(_session_tiny_t5_config): # type: ignore def tiny_mistral_config_helper(): transformers = pytest.importorskip('transformers') - tiny_overrides = {'hidden_size': 128, 'intermediate_size': 256, 'num_attention_heads': 8, 'num_hidden_layers': 2, 'num_kv_heads': 4} + tiny_overrides = { + 'hidden_size': 128, + 'intermediate_size': 256, + 'num_attention_heads': 8, + 'num_hidden_layers': 2, + 'num_kv_heads': 4 + } return transformers.AutoConfig.from_pretrained('mistralai/Mistral-7B-v0.1', **tiny_overrides) @@ -349,10 +355,12 @@ def tiny_mistral_model_helper(config): return transformers.AutoModelForCausalLM.from_config(config) + @pytest.fixture(scope='session') def _session_tiny_t5_model(_session_tiny_t5_config): # type: ignore return tiny_t5_model_helper(_session_tiny_t5_config) + @pytest.fixture def tiny_bert_model(_session_tiny_bert_model): return copy.deepcopy(_session_tiny_bert_model) @@ -427,14 +435,17 @@ def tiny_t5_tokenizer(_session_tiny_t5_tokenizer): def tiny_t5_model(_session_tiny_t5_model): return copy.deepcopy(_session_tiny_t5_model) + @pytest.fixture def tiny_mistral_config(_session_tiny_mistral_config): return copy.deepcopy(_session_tiny_mistral_config) + @pytest.fixture def tiny_mistral_tokenizer(_session_tiny_mistral_tokenizer): return copy.deepcopy(_session_tiny_mistral_tokenizer) + @pytest.fixture def tiny_mistral_model(_session_tiny_mistral_model): return copy.deepcopy(_session_tiny_mistral_model) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index cd0c6cd523..6872498f96 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -26,13 +26,14 @@ from tests.common.datasets import RandomTextClassificationDataset, RandomTextLMDataset, RandomTextRegressionDataset from tests.common.markers import device, world_size from tests.common.models import (configure_tiny_bert_model, configure_tiny_bert_tokenizer, configure_tiny_gpt2_model, - configure_tiny_gpt2_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer, - configure_tiny_mistral_model, configure_tiny_mistral_tokenizer,) + configure_tiny_gpt2_tokenizer, configure_tiny_mistral_model, + configure_tiny_mistral_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer) from tests.loggers.test_remote_uploader_downloader import DummyObjectStore if TYPE_CHECKING: from peft import PeftConfig + def _gpt2_peft_config(): pytest.importorskip('peft') from peft import get_peft_config @@ -45,10 +46,12 @@ def _gpt2_peft_config(): }) return peft_config + @pytest.fixture def gpt2_peft_config(): return _gpt2_peft_config() + def _mistral_peft_config(): 
pytest.importorskip('peft') from peft import get_peft_config @@ -60,10 +63,12 @@ def _mistral_peft_config(): }) return peft_config + @pytest.fixture def mistral_peft_config(): return _mistral_peft_config() + def test_hf_tokenizer_save(tmp_path: Path, tiny_bert_model, tiny_bert_tokenizer): transformers = pytest.importorskip('transformers') @@ -1238,6 +1243,7 @@ def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_c for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): torch.testing.assert_close(p1, p2) + @pytest.mark.parametrize('model,tokenizer,peft_config', [ (configure_tiny_gpt2_model, configure_tiny_gpt2_tokenizer, _gpt2_peft_config()), (configure_tiny_mistral_model, configure_tiny_mistral_tokenizer, _mistral_peft_config()), From 0c21573282e04ef09d6e1ae9f03c07a66f5f0b33 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 01:39:08 -0800 Subject: [PATCH 12/64] fix --- tests/models/test_hf_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 6872498f96..5d04d57de6 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1325,6 +1325,8 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config 'verbose': False } + stashed_model = copy.deepcopy(tiny_gpt2_model) + trainer = get_lm_trainer( tiny_gpt2_model, tiny_gpt2_tokenizer, @@ -1337,7 +1339,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config trainer.fit() load_trainer = get_lm_trainer( - tiny_gpt2_model, + stashed_model, tiny_gpt2_tokenizer, str(tmp_path), peft_config=gpt2_peft_config, From a4031f1bd9cd723fae100059f8c600aa1183e019 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 01:44:11 -0800 Subject: [PATCH 13/64] no peft --- tests/models/test_hf_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 5d04d57de6..4076087ab6 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1331,7 +1331,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config tiny_gpt2_model, tiny_gpt2_tokenizer, str(tmp_path), - peft_config=gpt2_peft_config, + # peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, fsdp_config=fsdp_config, @@ -1342,7 +1342,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config stashed_model, tiny_gpt2_tokenizer, str(tmp_path), - peft_config=gpt2_peft_config, + # peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, load_path=str(tmp_path / 'hf-checkpoint.pt'), From 44f8a896915f835db0eb066d7832e7bfa937f842 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 01:47:42 -0800 Subject: [PATCH 14/64] rank0 path --- tests/models/test_hf_model.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 4076087ab6..afe863edf5 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1338,6 +1338,10 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config ) trainer.fit() + tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) + gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) + rank0_path = gathered_paths[0] + load_trainer = get_lm_trainer( stashed_model, tiny_gpt2_tokenizer, @@ -1345,7 +1349,7 @@ def 
test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config # peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, - load_path=str(tmp_path / 'hf-checkpoint.pt'), + load_path=str(rank0_path / 'hf-checkpoint.pt'), fsdp_config=fsdp_config, ) From 1ac433737063b737f39020ab7aeb6d6e865f710c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 01:48:47 -0800 Subject: [PATCH 15/64] rank0 path --- tests/models/test_hf_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index afe863edf5..058b4dd2bf 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1349,7 +1349,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config # peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, - load_path=str(rank0_path / 'hf-checkpoint.pt'), + load_path=Path(rank0_path) / 'hf-checkpoint.pt', fsdp_config=fsdp_config, ) From 538a54fb8ce891d388af0c2f27b018d1a842bea4 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 01:50:51 -0800 Subject: [PATCH 16/64] rank0 path --- tests/models/test_hf_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 058b4dd2bf..559ceda95e 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1340,7 +1340,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) - rank0_path = gathered_paths[0] + rank0_path = Path(gathered_paths[0]) load_trainer = get_lm_trainer( stashed_model, @@ -1349,7 +1349,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config # peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, - load_path=Path(rank0_path) / 'hf-checkpoint.pt', + load_path=str(rank0_path / 'hf-checkpoint.pt'), fsdp_config=fsdp_config, ) From b81506a95d765a70910c912f9b6a9c61a118ff89 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 01:53:32 -0800 Subject: [PATCH 17/64] rank0 path --- tests/models/test_hf_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 559ceda95e..b6e1c808cd 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1337,6 +1337,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config fsdp_config=fsdp_config, ) trainer.fit() + trainer.close() tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) From 0fda30020d5ceddfec4fb10271c03185dbd2f026 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 01:54:52 -0800 Subject: [PATCH 18/64] rank0 path --- tests/models/test_hf_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index b6e1c808cd..0037eec31c 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1343,6 +1343,8 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) rank0_path = Path(gathered_paths[0]) + print(torch.load(str(rank0_path / 'hf-checkpoint.pt'))) + load_trainer = get_lm_trainer( stashed_model, tiny_gpt2_tokenizer, From 
57bf7df0d93acdf81bc97da5f3900fdb26fd5451 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 02:00:01 -0800 Subject: [PATCH 19/64] rank0 path --- tests/models/test_hf_model.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 0037eec31c..30b4d2f847 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -521,8 +521,13 @@ def get_lm_trainer(hf_model, collate_fn=collator, sampler=dist.get_sampler(train_dataset)) + from composer.optim import DecoupledAdamW + + optimizer = DecoupledAdamW(model.parameters(), lr=1e-3) + in_memory_logger = InMemoryLogger() trainer = Trainer(model=model, + optimizer=optimizer, train_dataloader=train_dataloader, eval_dataloader=eval_dataloader, max_duration='1ep', @@ -1343,8 +1348,6 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) rank0_path = Path(gathered_paths[0]) - print(torch.load(str(rank0_path / 'hf-checkpoint.pt'))) - load_trainer = get_lm_trainer( stashed_model, tiny_gpt2_tokenizer, From e4792f8af517015499423cdacfc00502e9a8763a Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 02:00:48 -0800 Subject: [PATCH 20/64] rank0 path --- tests/models/test_hf_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 30b4d2f847..3e5db4976f 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -527,7 +527,7 @@ def get_lm_trainer(hf_model, in_memory_logger = InMemoryLogger() trainer = Trainer(model=model, - optimizer=optimizer, + optimizers=optimizer, train_dataloader=train_dataloader, eval_dataloader=eval_dataloader, max_duration='1ep', From d01e79f8c9ab208bea1f06f29ae833bd3c3127ac Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 02:01:41 -0800 Subject: [PATCH 21/64] rank0 path --- tests/models/test_hf_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 3e5db4976f..3149e332e7 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1336,7 +1336,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config tiny_gpt2_model, tiny_gpt2_tokenizer, str(tmp_path), - # peft_config=gpt2_peft_config, + peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, fsdp_config=fsdp_config, @@ -1352,7 +1352,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config stashed_model, tiny_gpt2_tokenizer, str(tmp_path), - # peft_config=gpt2_peft_config, + peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, load_path=str(rank0_path / 'hf-checkpoint.pt'), From aa552f22d8d270d131dffcf941f9884b3f1d5ecd Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 02:02:36 -0800 Subject: [PATCH 22/64] rank0 path --- tests/models/test_hf_model.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 3149e332e7..f77fe13746 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1344,10 +1344,6 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config trainer.fit() trainer.close() - tmp_path_to_broadcast = str(os.path.abspath(tmp_path)) - gathered_paths = 
dist.all_gather_object(tmp_path_to_broadcast) - rank0_path = Path(gathered_paths[0]) - load_trainer = get_lm_trainer( stashed_model, tiny_gpt2_tokenizer, @@ -1355,7 +1351,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, - load_path=str(rank0_path / 'hf-checkpoint.pt'), + load_path=str(tmp_path / 'hf-checkpoint.pt'), fsdp_config=fsdp_config, ) From 49c10ac1a187522d592eabfd0a718051e276cee8 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 02:28:18 -0800 Subject: [PATCH 23/64] sd filter --- tests/models/test_hf_model.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index f77fe13746..d7b7081d47 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -471,7 +471,8 @@ def get_lm_trainer(hf_model, batch_size: int = 4, sequence_length: int = 4, size: int = 4, - peft_config: Optional['PeftConfig'] = None): + peft_config: Optional['PeftConfig'] = None, + just_lora: bool = False): transformers = pytest.importorskip('transformers') metrics: List[Metric] = [LanguageCrossEntropy(ignore_index=-100)] @@ -484,6 +485,7 @@ def get_lm_trainer(hf_model, metrics=metrics, use_logits=True, peft_config=peft_config, + peft_filter_state_dict_trainable=just_lora, ) vocab_size = hf_model.config.vocab_size @@ -1221,8 +1223,8 @@ def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): from composer.models import HuggingFaceModel _ = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) - -def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): +@pytest.mark.parametrize('just_lora', [True, False]) +def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): pytest.importorskip('peft') trainer = get_lm_trainer( @@ -1232,6 +1234,7 @@ def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_c peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, + just_lora=just_lora, ) trainer.fit() @@ -1243,6 +1246,7 @@ def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_c device_train_microbatch_size=1, mlm=False, load_path=str(tmp_path / 'hf-checkpoint.pt'), + just_lora=just_lora, ) for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): @@ -1279,8 +1283,8 @@ def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config): assert loaded_peft_config == gpt2_peft_config - -def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): +@pytest.mark.parametrize('just_lora', [True, False]) +def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): peft = pytest.importorskip('peft') transformers = pytest.importorskip('transformers') @@ -1295,6 +1299,7 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_ peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, + just_lora=just_lora, ) trainer.fit() @@ -1315,9 +1320,10 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_ @pytest.mark.gpu @world_size(2) +@pytest.mark.parametrize('just_lora', [True, False]) @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'), reason='requires PyTorch 1.13 or higher') -def 
test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size): +def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size, just_lora): pytest.importorskip('peft') fsdp_config = { @@ -1340,6 +1346,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config device_train_microbatch_size=1, mlm=False, fsdp_config=fsdp_config, + just_lora=just_lora, ) trainer.fit() trainer.close() @@ -1353,6 +1360,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config mlm=False, load_path=str(tmp_path / 'hf-checkpoint.pt'), fsdp_config=fsdp_config, + just_lora=just_lora, ) from torch.distributed.fsdp import FullyShardedDataParallel as FSDP @@ -1360,3 +1368,12 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config with FSDP.summon_full_params(trainer.state.model), FSDP.summon_full_params(load_trainer.state.model): for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): torch.testing.assert_close(p1, p2) + +def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): + pytest.importorskip('peft') + + hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=gpt2_peft_config, peft_filter_state_dict_trainable=True) + state_dict = hf_model.state_dict() + + assert len(state_dict.keys()) == 4 + From 4b7e4724b2f92741fb094421ebfa60be5bfb18d9 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 02:29:06 -0800 Subject: [PATCH 24/64] filter sd --- composer/models/huggingface.py | 58 ++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 5428d0104a..2498c5f32e 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -81,7 +81,8 @@ def __init__(self, eval_metrics: Optional[List[Metric]] = None, shift_labels: Optional[bool] = None, allow_embedding_resizing: bool = False, - peft_config: Optional['PeftConfig'] = None) -> None: + peft_config: Optional['PeftConfig'] = None, + peft_filter_state_dict_trainable: bool = False) -> None: try: import transformers del transformers # unused @@ -96,6 +97,7 @@ def __init__(self, self.model_forward_args = inspect.getfullargspec(self.model.forward).args self.tokenizer = tokenizer + self.peft_filter_state_dict_trainable = peft_filter_state_dict_trainable if peft_config is not None: if not _peft_installed: raise MissingConditionalImportError(extra_deps_group='peft', @@ -164,6 +166,16 @@ def __init__(self, self.model = get_peft_model(self.model, peft_config) log.info(f'PEFT model created. 
{self.model}') + def state_dict(self, *args, **kwargs) -> Dict[str, Any]: + """Returns the state dict of the model.""" + full_state_dict = super().state_dict(*args, **kwargs) + + if self.peft_filter_state_dict_trainable: + full_state_dict = filter_state_dict_peft(full_state_dict, self.model.peft_config[self.model.active_adapter], False) + + return full_state_dict + + @staticmethod def load_huggingface_tokenizer_from_saved_state( hf_state: Dict[str, Any], @@ -785,25 +797,31 @@ def write_huggingface_pretrained_from_composer_checkpoint( # NOTE: This only works for default adapter name if peft_config is not None: - # Filtering copied from https://github.com/huggingface/peft/blob/4186c9b104644fd247a4cc0dc2dfc1ede4665204/src/peft/utils/save_and_load.py#L68C1-L86C116 - bias = peft_config.bias - if bias == 'none': - to_return = {k: weights_state_dict[k] for k in weights_state_dict if 'lora_' in k} - elif bias == 'all': - to_return = {k: weights_state_dict[k] for k in weights_state_dict if 'lora_' in k or 'bias' in k} - elif bias == 'lora_only': - to_return = {} - for k in weights_state_dict: - if 'lora_' in k: - to_return[k] = weights_state_dict[k] - bias_name = k.split('lora_')[0] + 'bias' - if bias_name in weights_state_dict: - to_return[bias_name] = weights_state_dict[bias_name] - else: - raise NotImplementedError - to_return = {k: v for k, v in to_return.items() if (('lora_' in k and 'default' in k) or ('bias' in k))} - to_return = {k.replace(f'.default', ''): v for k, v in to_return.items()} + weights_state_dict = filter_state_dict_peft(weights_state_dict, peft_config) - torch.save(to_return, Path(output_folder) / 'adapter_model.bin') + torch.save(weights_state_dict, Path(output_folder) / 'adapter_model.bin') else: torch.save(weights_state_dict, Path(output_folder) / 'pytorch_model.bin') + +def filter_state_dict_peft(state_dict: Dict[str, Any], peft_config: 'PeftConfig', remove_adapter_names: bool = True) -> Dict[str, Any]: + # Filtering copied from https://github.com/huggingface/peft/blob/4186c9b104644fd247a4cc0dc2dfc1ede4665204/src/peft/utils/save_and_load.py#L68C1-L86C116 + bias = peft_config.bias + if bias == 'none': + to_return = {k: state_dict[k] for k in state_dict if 'lora_' in k} + elif bias == 'all': + to_return = {k: state_dict[k] for k in state_dict if 'lora_' in k or 'bias' in k} + elif bias == 'lora_only': + to_return = {} + for k in state_dict: + if 'lora_' in k: + to_return[k] = state_dict[k] + bias_name = k.split('lora_')[0] + 'bias' + if bias_name in state_dict: + to_return[bias_name] = state_dict[bias_name] + else: + raise NotImplementedError + to_return = {k: v for k, v in to_return.items() if (('lora_' in k and 'default' in k) or ('bias' in k))} + + if remove_adapter_names: + to_return = {k.replace(f'.default', ''): v for k, v in to_return.items()} + return to_return \ No newline at end of file From 09346e7f3367ae17409d2a153ec02db1869e09e0 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 9 Jan 2024 14:16:18 -0800 Subject: [PATCH 25/64] precommit --- composer/models/huggingface.py | 15 +++++++++------ tests/models/test_hf_model.py | 9 +++++++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 2498c5f32e..d7fb0404ea 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -169,13 +169,13 @@ def __init__(self, def state_dict(self, *args, **kwargs) -> Dict[str, Any]: """Returns the state dict of the model.""" full_state_dict = super().state_dict(*args, **kwargs) 
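# (Hedged illustration, not a line of this patch.) With peft_filter_state_dict_trainable=True,
# the full state dict gathered above is narrowed to the trainable LoRA entries before being
# returned, so a Composer checkpoint of a LoRA run stores only adapter weights. A rough usage
# sketch, where `model`, `tokenizer`, and `peft_config` stand in for any HF causal LM, its
# tokenizer, and a LoRA PeftConfig:
#
#     from composer.models import HuggingFaceModel
#
#     composer_model = HuggingFaceModel(model,
#                                       tokenizer=tokenizer,
#                                       peft_config=peft_config,
#                                       peft_filter_state_dict_trainable=True)
#     assert all('lora_' in k for k in composer_model.state_dict().keys())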
- + if self.peft_filter_state_dict_trainable: - full_state_dict = filter_state_dict_peft(full_state_dict, self.model.peft_config[self.model.active_adapter], False) + full_state_dict = filter_state_dict_peft(full_state_dict, self.model.peft_config[self.model.active_adapter], + False) return full_state_dict - @staticmethod def load_huggingface_tokenizer_from_saved_state( hf_state: Dict[str, Any], @@ -803,7 +803,10 @@ def write_huggingface_pretrained_from_composer_checkpoint( else: torch.save(weights_state_dict, Path(output_folder) / 'pytorch_model.bin') -def filter_state_dict_peft(state_dict: Dict[str, Any], peft_config: 'PeftConfig', remove_adapter_names: bool = True) -> Dict[str, Any]: + +def filter_state_dict_peft(state_dict: Dict[str, Any], + peft_config: 'PeftConfig', + remove_adapter_names: bool = True) -> Dict[str, Any]: # Filtering copied from https://github.com/huggingface/peft/blob/4186c9b104644fd247a4cc0dc2dfc1ede4665204/src/peft/utils/save_and_load.py#L68C1-L86C116 bias = peft_config.bias if bias == 'none': @@ -821,7 +824,7 @@ def filter_state_dict_peft(state_dict: Dict[str, Any], peft_config: 'PeftConfig' else: raise NotImplementedError to_return = {k: v for k, v in to_return.items() if (('lora_' in k and 'default' in k) or ('bias' in k))} - + if remove_adapter_names: to_return = {k.replace(f'.default', ''): v for k, v in to_return.items()} - return to_return \ No newline at end of file + return to_return diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index d7b7081d47..0cf13fb372 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1223,6 +1223,7 @@ def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): from composer.models import HuggingFaceModel _ = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) + @pytest.mark.parametrize('just_lora', [True, False]) def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): pytest.importorskip('peft') @@ -1283,6 +1284,7 @@ def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config): assert loaded_peft_config == gpt2_peft_config + @pytest.mark.parametrize('just_lora', [True, False]) def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): peft = pytest.importorskip('peft') @@ -1369,11 +1371,14 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): torch.testing.assert_close(p1, p2) + def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): pytest.importorskip('peft') - hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=gpt2_peft_config, peft_filter_state_dict_trainable=True) + hf_model = HuggingFaceModel(tiny_gpt2_model, + tokenizer=tiny_gpt2_tokenizer, + peft_config=gpt2_peft_config, + peft_filter_state_dict_trainable=True) state_dict = hf_model.state_dict() assert len(state_dict.keys()) == 4 - From dd47afefb3c6fcf226080ac797415f81f6bb4ed4 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 16 Jan 2024 15:42:28 -0800 Subject: [PATCH 26/64] more complete test --- tests/models/test_hf_model.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 0cf13fb372..1f806a77b3 100644 --- a/tests/models/test_hf_model.py +++ 
b/tests/models/test_hf_model.py @@ -1343,34 +1343,58 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config trainer = get_lm_trainer( tiny_gpt2_model, tiny_gpt2_tokenizer, - str(tmp_path), + str(tmp_path / 'trainer1'), peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, fsdp_config=fsdp_config, just_lora=just_lora, ) + + for n, p in trainer.state.model.model.named_parameters(): + if 'lora' in n: + assert p.requires_grad + else: + assert not p.requires_grad + trainer.fit() trainer.close() load_trainer = get_lm_trainer( stashed_model, tiny_gpt2_tokenizer, - str(tmp_path), + str(tmp_path / 'trainer2'), peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, - load_path=str(tmp_path / 'hf-checkpoint.pt'), + load_path=str(tmp_path / 'trainer1' / 'hf-checkpoint.pt'), fsdp_config=fsdp_config, just_lora=just_lora, ) + for n, p in load_trainer.state.model.model.named_parameters(): + if 'lora' in n: + assert p.requires_grad + else: + assert not p.requires_grad + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP with FSDP.summon_full_params(trainer.state.model), FSDP.summon_full_params(load_trainer.state.model): for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): torch.testing.assert_close(p1, p2) + loaded_ckpt_1 = torch.load(str(tmp_path / 'trainer1' / 'hf-checkpoint.pt')) + loaded_ckpt_2 = torch.load(str(tmp_path / 'trainer2' / 'hf-checkpoint.pt')) + + # Check that only the LoRA parameters were saved + if just_lora: + assert all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) + assert all('lora' in k for k in loaded_ckpt_2['state']['model'].keys()) + else: + assert not all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) + assert not all('lora' in k for k in loaded_ckpt_2['state']['model'].keys()) + def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): pytest.importorskip('peft') From ca494c5ede814403709809774b7d5e364fef327c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 16 Jan 2024 15:45:40 -0800 Subject: [PATCH 27/64] gate --- tests/models/test_hf_model.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 1f806a77b3..73020f481d 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1384,16 +1384,17 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): torch.testing.assert_close(p1, p2) - loaded_ckpt_1 = torch.load(str(tmp_path / 'trainer1' / 'hf-checkpoint.pt')) - loaded_ckpt_2 = torch.load(str(tmp_path / 'trainer2' / 'hf-checkpoint.pt')) - - # Check that only the LoRA parameters were saved - if just_lora: - assert all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) - assert all('lora' in k for k in loaded_ckpt_2['state']['model'].keys()) - else: - assert not all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) - assert not all('lora' in k for k in loaded_ckpt_2['state']['model'].keys()) + if dist.get_global_rank() == 0: + loaded_ckpt_1 = torch.load(str(tmp_path / 'trainer1' / 'hf-checkpoint.pt')) + loaded_ckpt_2 = torch.load(str(tmp_path / 'trainer2' / 'hf-checkpoint.pt')) + + # Check that only the LoRA parameters were saved + if just_lora: + assert all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) + assert 
all('lora' in k for k in loaded_ckpt_2['state']['model'].keys()) + else: + assert not all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) + assert not all('lora' in k for k in loaded_ckpt_2['state']['model'].keys()) def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): From b345d85330a62237e38a2fe5b780442f1cda642e Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 16 Jan 2024 15:46:47 -0800 Subject: [PATCH 28/64] fix --- tests/models/test_hf_model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 73020f481d..a2a17f4ecc 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1386,15 +1386,12 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config if dist.get_global_rank() == 0: loaded_ckpt_1 = torch.load(str(tmp_path / 'trainer1' / 'hf-checkpoint.pt')) - loaded_ckpt_2 = torch.load(str(tmp_path / 'trainer2' / 'hf-checkpoint.pt')) # Check that only the LoRA parameters were saved if just_lora: assert all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) - assert all('lora' in k for k in loaded_ckpt_2['state']['model'].keys()) else: assert not all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) - assert not all('lora' in k for k in loaded_ckpt_2['state']['model'].keys()) def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): From c400ab0d81aa558381b3d068583d8ed684adfb87 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 17 Jan 2024 10:31:14 -0800 Subject: [PATCH 29/64] debug --- composer/models/huggingface.py | 1 + 1 file changed, 1 insertion(+) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index d7fb0404ea..facdd8d8fc 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -413,6 +413,7 @@ def hf_from_composer_checkpoint( return hf_model, hf_tokenizer def forward(self, batch): + print(batch) if isinstance(batch, Mapping): # Further input validation is left to the huggingface forward call batch = {k: v for k, v in batch.items() if k in self.model_forward_args} From f276664f373612aa7dceda4590b4691022daa024 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 17 Jan 2024 19:50:18 +0000 Subject: [PATCH 30/64] fix inspect for peft --- composer/models/huggingface.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index facdd8d8fc..7891af1e95 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -95,6 +95,12 @@ def __init__(self, self.model = model self.config: PretrainedConfig = model.config self.model_forward_args = inspect.getfullargspec(self.model.forward).args + import transformers + if _peft_installed and self.model_forward_args == ['self']: + from peft import PeftModel + if isinstance(self.model, PeftModel): + self.model_forward_args = inspect.getfullargspec(self.model.base_model.model.forward).args + self.tokenizer = tokenizer self.peft_filter_state_dict_trainable = peft_filter_state_dict_trainable From e9d4c4c9ba6b02484a5055139a1f631d4eaf80ce Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 01:05:50 -0800 Subject: [PATCH 31/64] precommit --- composer/datasets/utils.py | 6 ++-- composer/models/huggingface.py | 34 ++++++++++++++----- .../test_in_context_learning_datasets.py | 4 +-- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git 
a/composer/datasets/utils.py b/composer/datasets/utils.py index 431a860900..6bb376ff30 100644 --- a/composer/datasets/utils.py +++ b/composer/datasets/utils.py @@ -179,7 +179,7 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria): def __init__( self, stop_sequence: str, - tokenizer: transformers.PreTrainedTokenizer, + tokenizer: transformers.PreTrainedTokenizerBase, batch_size: int, ) -> None: self.done_tracker = [False] * batch_size @@ -196,7 +196,7 @@ def __init__( self.stop_sequence_id_len = len(self.stop_sequence_ids) + 2 self.tokenizer = tokenizer - def __call__(self, input_ids, scores: Optional[torch.FloatTensor] = None, **kwargs) -> bool: + def __call__(self, input_ids: torch.LongTensor, scores: Optional[torch.FloatTensor] = None, **kwargs) -> bool: # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence lookback_ids_batch = input_ids[:, :][:, -self.stop_sequence_id_len:] @@ -213,7 +213,7 @@ def __call__(self, input_ids, scores: Optional[torch.FloatTensor] = None, **kwar return False not in self.done_tracker def stop_sequences_criteria( - tokenizer: transformers.PreTrainedTokenizer, + tokenizer: transformers.PreTrainedTokenizerBase, stop_sequences: List[str], batch_size: int, ) -> transformers.StoppingCriteriaList: diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index dbeda8a6a6..2dbeabbe34 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -13,6 +13,7 @@ import string import tempfile import textwrap +import warnings from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Type, Union @@ -159,7 +160,8 @@ def __init__(self, self.labels: Optional[torch.Tensor] = None # set in eval_forward() if exists - is_causal_lm = _is_registered_causal_lm(model) + is_causal_lm = _is_registered_causal_lm(self.model) + self.shift_labels = is_causal_lm if shift_labels is None else shift_labels if is_causal_lm and not self.shift_labels: log.warning('The shift_labels argument was set to False but the model is an instance of a' @@ -169,16 +171,21 @@ def __init__(self, self.dummy_forward_called = False if peft_config is not None: - self.model = get_peft_model(self.model, peft_config) - log.info(f'PEFT model created. {self.model}') + from peft import PeftModel + if isinstance(self.model, PeftModel): + warnings.warn('PEFT model was passed in directly. Ignoring the provided PEFT config.') + else: + self.model = get_peft_model(self.model, peft_config) + log.info(f'PEFT model created. 
{self.model}') def state_dict(self, *args, **kwargs) -> Dict[str, Any]: """Returns the state dict of the model.""" full_state_dict = super().state_dict(*args, **kwargs) if self.peft_filter_state_dict_trainable: - full_state_dict = filter_state_dict_peft(full_state_dict, self.model.peft_config[self.model.active_adapter], - False) + active_adapter = self.model.active_adapter + assert isinstance(active_adapter, str) + full_state_dict = filter_state_dict_peft(full_state_dict, self.model.peft_config[active_adapter], False) return full_state_dict @@ -474,6 +481,7 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # HF encoder decoder models like T5 expect either decoder_input_ids or labels, # so we add decoder_input_ids to the batch if it is missing + assert isinstance(self.model.config, PretrainedConfig) model_config: PretrainedConfig = self.model.config if model_config.is_encoder_decoder and 'decoder_input_ids' not in batch: if hasattr(self.model, 'prepare_decoder_input_ids_from_labels'): @@ -532,6 +540,7 @@ def get_metadata(self): model_dir = tmp_dir / 'model' tokenizer_dir = tmp_dir / 'tokenizer' + assert isinstance(self.model.config, PretrainedConfig) original_model_config: PretrainedConfig = self.model.config original_model_config.save_pretrained(model_dir) if self.tokenizer is not None: @@ -615,6 +624,7 @@ def generate(self, input_ids: torch.Tensor, **kwargs): if not using_torch_2() and not self.dummy_forward_called and is_model_fsdp(self.model): with torch.no_grad(): maybe_decoder_input_ids = {} + assert isinstance(self.model.config, PretrainedConfig) model_config: PretrainedConfig = self.model.config if model_config.is_encoder_decoder: maybe_decoder_input_ids['decoder_input_ids'] = torch.tensor([[0]], @@ -638,7 +648,7 @@ def generate(self, input_ids: torch.Tensor, **kwargs): return self.model.generate(input_ids=input_ids, pad_token_id=pad_token_id, **kwargs) -def _is_registered_causal_lm(model: transformers.PreTrainedModel) -> bool: +def _is_registered_causal_lm(model: Union[transformers.PreTrainedModel, 'PeftModel']) -> bool: """Return True if model class is either a registered 🤗 Causal LM or a subclass of one""" try: from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING @@ -647,6 +657,11 @@ def _is_registered_causal_lm(model: transformers.PreTrainedModel) -> bool: conda_package='transformers', conda_channel='conda-forge') from e + if _peft_installed and isinstance(model, PeftModel): + model_to_check = model.base_model.model + else: + model_to_check = model + # This try/except is needed until https://github.com/huggingface/transformers/issues/26778 # is resolved in a release. This means that this attempt to automatically detect causal LMs # does not currently work in an environment with flash attention <2 installed. @@ -658,7 +673,7 @@ def _is_registered_causal_lm(model: transformers.PreTrainedModel) -> bool: return False else: raise e - return any(isinstance(model, causal_lm_class) for causal_lm_class in causal_lm_classes) # type: ignore + return any(isinstance(model_to_check, causal_lm_class) for causal_lm_class in causal_lm_classes) # type: ignore def get_hf_config_from_composer_state_dict(state_dict: Dict[str, Any], @@ -819,8 +834,11 @@ def write_huggingface_pretrained_from_composer_checkpoint( def filter_state_dict_peft(state_dict: Dict[str, Any], peft_config: 'PeftConfig', remove_adapter_names: bool = True) -> Dict[str, Any]: + if peft_config.peft_type != 'LORA': + raise NotImplementedError(f'Only LoRA PEFT is supported. 
Got {peft_config.peft_type}') + # Filtering copied from https://github.com/huggingface/peft/blob/4186c9b104644fd247a4cc0dc2dfc1ede4665204/src/peft/utils/save_and_load.py#L68C1-L86C116 - bias = peft_config.bias + bias = peft_config.bias # type: ignore if bias == 'none': to_return = {k: state_dict[k] for k in state_dict if 'lora_' in k} elif bias == 'all': diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index ec7df306d6..2a3ff87884 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -73,13 +73,13 @@ def test_stop_sequences_criteria(tiny_gpt2_tokenizer): seq1 = tiny_gpt2_tokenizer('Dogs are furry')['input_ids'] seq2 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids'] seq1 = [50257] * (len(seq2) - len(seq1)) + seq1 - input_ids = torch.tensor([seq1, seq2]) + input_ids = torch.LongTensor([seq1, seq2]) assert not eos_criteria(input_ids, None) eos_criteria = MultiTokenEOSCriteria('\n\n', tiny_gpt2_tokenizer, 2) seq1 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids'] seq2 = tiny_gpt2_tokenizer('Dogs are furry\n\n')['input_ids'] - input_ids = torch.tensor([seq1, seq2]) + input_ids = torch.LongTensor([seq1, seq2]) assert eos_criteria(input_ids, None) From eca54b19cd2a591670c3eb6c8ba6d8b551a5e791 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 10:06:27 -0800 Subject: [PATCH 32/64] update --- composer/models/huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 2dbeabbe34..4eb85c9329 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -25,7 +25,7 @@ from composer.utils import MissingConditionalImportError, dist, get_file, import_object, is_model_fsdp, safe_torch_load try: - from peft import get_peft_model + from peft import PeftModel, get_peft_model _peft_installed = True except: _peft_installed = False From f019e1beeef115b10895fdd185f848bdc447e4f7 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 12:14:41 -0800 Subject: [PATCH 33/64] fix imports --- composer/models/huggingface.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 4eb85c9329..465bdd5774 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -481,6 +481,7 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # HF encoder decoder models like T5 expect either decoder_input_ids or labels, # so we add decoder_input_ids to the batch if it is missing + from transformers import PretrainedConfig assert isinstance(self.model.config, PretrainedConfig) model_config: PretrainedConfig = self.model.config if model_config.is_encoder_decoder and 'decoder_input_ids' not in batch: @@ -539,7 +540,8 @@ def get_metadata(self): tmp_dir = Path(tmp_dir) model_dir = tmp_dir / 'model' tokenizer_dir = tmp_dir / 'tokenizer' - + + from transformers import PretrainedConfig assert isinstance(self.model.config, PretrainedConfig) original_model_config: PretrainedConfig = self.model.config original_model_config.save_pretrained(model_dir) @@ -624,6 +626,7 @@ def generate(self, input_ids: torch.Tensor, **kwargs): if not using_torch_2() and not self.dummy_forward_called and is_model_fsdp(self.model): with torch.no_grad(): maybe_decoder_input_ids = {} + from transformers import PretrainedConfig assert isinstance(self.model.config, 
PretrainedConfig) model_config: PretrainedConfig = self.model.config if model_config.is_encoder_decoder: From d8258aa7e195b13361916db263da59d01f4579c9 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 12:15:44 -0800 Subject: [PATCH 34/64] precommit --- composer/models/huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 465bdd5774..9aadc5b8c8 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -540,7 +540,7 @@ def get_metadata(self): tmp_dir = Path(tmp_dir) model_dir = tmp_dir / 'model' tokenizer_dir = tmp_dir / 'tokenizer' - + from transformers import PretrainedConfig assert isinstance(self.model.config, PretrainedConfig) original_model_config: PretrainedConfig = self.model.config From 70421856aecf5264f5023b22d675b3d013103bba Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 13:22:19 -0800 Subject: [PATCH 35/64] round 1 --- composer/models/huggingface.py | 1 - tests/common/models.py | 58 +++++++++++++++++----------------- tests/conftest.py | 17 ++++++---- 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 9aadc5b8c8..f3855ce6c4 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -429,7 +429,6 @@ def hf_from_composer_checkpoint( return hf_model, hf_tokenizer def forward(self, batch): - print(batch) if isinstance(batch, Mapping): # Further input validation is left to the huggingface forward call batch = {k: v for k, v in batch.items() if k in self.model_forward_args} diff --git a/tests/common/models.py b/tests/common/models.py index a0b66d8929..4efd832b62 100644 --- a/tests/common/models.py +++ b/tests/common/models.py @@ -574,32 +574,32 @@ def configure_tiny_t5_hf_model(use_logits: bool = True) -> HuggingFaceModel: return HuggingFaceModel(configure_tiny_t5_model(), configure_tiny_t5_tokenizer(), use_logits) -def configure_tiny_mistral_model() -> 'PreTrainedModel': - try: - from transformers import PreTrainedModel - assert isinstance(pytest.tiny_mistral_model, PreTrainedModel) - return copy.deepcopy(pytest.tiny_mistral_model) - except AttributeError: - pytest.skip('Composer installed without NLP support') - - -def configure_tiny_mistral_tokenizer() -> Union['PreTrainedTokenizer', 'PreTrainedTokenizerFast']: - try: - from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast - assert isinstance(pytest.tiny_mistral_tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)) - return copy.deepcopy(pytest.tiny_mistral_tokenizer) - except AttributeError: - pytest.skip('Composer installed without NLP support') - - -def configure_tiny_mistral_config() -> 'PretrainedConfig': - try: - from transformers import PretrainedConfig - assert isinstance(pytest.tiny_mistral_config, PretrainedConfig) - return copy.deepcopy(pytest.tiny_mistral_config) - except AttributeError: - pytest.skip('Composer installed without NLP support') - - -def configure_tiny_mistral_hf_model(use_logits: bool = True) -> HuggingFaceModel: - return HuggingFaceModel(configure_tiny_mistral_model(), configure_tiny_mistral_tokenizer(), use_logits) +# def configure_tiny_mistral_model() -> 'PreTrainedModel': +# try: +# from transformers import PreTrainedModel +# assert isinstance(pytest.tiny_mistral_model, PreTrainedModel) +# return copy.deepcopy(pytest.tiny_mistral_model) +# except AttributeError: +# pytest.skip('Composer installed without NLP support') + + 
+# def configure_tiny_mistral_tokenizer() -> Union['PreTrainedTokenizer', 'PreTrainedTokenizerFast']: +# try: +# from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +# assert isinstance(pytest.tiny_mistral_tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)) +# return copy.deepcopy(pytest.tiny_mistral_tokenizer) +# except AttributeError: +# pytest.skip('Composer installed without NLP support') + + +# def configure_tiny_mistral_config() -> 'PretrainedConfig': +# try: +# from transformers import PretrainedConfig +# assert isinstance(pytest.tiny_mistral_config, PretrainedConfig) +# return copy.deepcopy(pytest.tiny_mistral_config) +# except AttributeError: +# pytest.skip('Composer installed without NLP support') + + +# def configure_tiny_mistral_hf_model(use_logits: bool = True) -> HuggingFaceModel: +# return HuggingFaceModel(configure_tiny_mistral_model(), configure_tiny_mistral_tokenizer(), use_logits) diff --git a/tests/conftest.py b/tests/conftest.py index bb923e8870..9e5c21f97a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -109,11 +109,16 @@ def pytest_configure(): TRANSFORMERS_INSTALLED = False if TRANSFORMERS_INSTALLED: + # from tests.fixtures.fixtures import (tiny_bert_config_helper, tiny_bert_model_helper, + # tiny_bert_tokenizer_helper, tiny_gpt2_config_helper, + # tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper, + # tiny_mistral_config_helper, tiny_mistral_model_helper, + # tiny_mistral_tokenizer_helper, tiny_opt_config_helper, + # tiny_opt_model_helper, tiny_opt_tokenizer_helper, tiny_t5_config_helper, + # tiny_t5_model_helper, tiny_t5_tokenizer_helper) from tests.fixtures.fixtures import (tiny_bert_config_helper, tiny_bert_model_helper, tiny_bert_tokenizer_helper, tiny_gpt2_config_helper, - tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper, - tiny_mistral_config_helper, tiny_mistral_model_helper, - tiny_mistral_tokenizer_helper, tiny_opt_config_helper, + tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper, tiny_opt_config_helper, tiny_opt_model_helper, tiny_opt_tokenizer_helper, tiny_t5_config_helper, tiny_t5_model_helper, tiny_t5_tokenizer_helper) pytest.tiny_bert_config = tiny_bert_config_helper() # type: ignore @@ -128,9 +133,9 @@ def pytest_configure(): pytest.tiny_t5_config = tiny_t5_config_helper() # type: ignore pytest.tiny_t5_model = tiny_t5_model_helper(pytest.tiny_t5_config) # type: ignore pytest.tiny_t5_tokenizer = tiny_t5_tokenizer_helper() # type: ignore - pytest.tiny_mistral_config = tiny_mistral_config_helper() # type: ignore - pytest.tiny_mistral_model = tiny_mistral_model_helper(pytest.tiny_mistral_config) # type: ignore - pytest.tiny_mistral_tokenizer = tiny_mistral_tokenizer_helper() # type: ignore + # pytest.tiny_mistral_config = tiny_mistral_config_helper() # type: ignore + # pytest.tiny_mistral_model = tiny_mistral_model_helper(pytest.tiny_mistral_config) # type: ignore + # pytest.tiny_mistral_tokenizer = tiny_mistral_tokenizer_helper() # type: ignore def pytest_sessionfinish(session: pytest.Session, exitstatus: int): From 651faa090efe2449c4a6fab3bd8b6f825c399593 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 13:29:40 -0800 Subject: [PATCH 36/64] remove more --- tests/models/test_hf_model.py | 394 +++++++++++++++++----------------- 1 file changed, 198 insertions(+), 196 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index a69b56b5a8..b34ca67369 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -25,9 +25,11 @@ from composer.utils 
import dist, is_model_fsdp from tests.common.datasets import RandomTextClassificationDataset, RandomTextLMDataset, RandomTextRegressionDataset from tests.common.markers import device, world_size +# from tests.common.models import (configure_tiny_bert_model, configure_tiny_bert_tokenizer, configure_tiny_gpt2_model, +# configure_tiny_gpt2_tokenizer, configure_tiny_mistral_model, +# configure_tiny_mistral_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer) from tests.common.models import (configure_tiny_bert_model, configure_tiny_bert_tokenizer, configure_tiny_gpt2_model, - configure_tiny_gpt2_tokenizer, configure_tiny_mistral_model, - configure_tiny_mistral_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer) + configure_tiny_gpt2_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer) from tests.loggers.test_remote_uploader_downloader import DummyObjectStore if TYPE_CHECKING: @@ -1203,204 +1205,204 @@ def test_eval_forward_generate(device, world_size, hf_model, hf_tokenizer, use_f assert all(isinstance(decoded_generation, str) for decoded_generation in generation2) -def test_peft_init(tiny_gpt2_model, gpt2_peft_config): - pytest.importorskip('peft') - from peft import PeftModelForCausalLM - - original_model = copy.deepcopy(tiny_gpt2_model) - hf_model = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) - assert isinstance(hf_model.model, PeftModelForCausalLM) - assert hf_model.model.peft_config['default'].peft_type == 'LORA' - assert hf_model.model.peft_config['default'].task_type == 'CAUSAL_LM' - assert hf_model.model.config == original_model.config - - -def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): - pytest.importorskip('peft') - - with patch('composer.models.huggingface._peft_installed', False): - with pytest.raises(ImportError): - from composer.models import HuggingFaceModel - _ = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) - - -@pytest.mark.parametrize('just_lora', [True, False]) -def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): - pytest.importorskip('peft') - - trainer = get_lm_trainer( - tiny_gpt2_model, - tiny_gpt2_tokenizer, - str(tmp_path), - peft_config=gpt2_peft_config, - device_train_microbatch_size=1, - mlm=False, - just_lora=just_lora, - ) - trainer.fit() - - load_trainer = get_lm_trainer( - tiny_gpt2_model, - tiny_gpt2_tokenizer, - str(tmp_path), - peft_config=gpt2_peft_config, - device_train_microbatch_size=1, - mlm=False, - load_path=str(tmp_path / 'hf-checkpoint.pt'), - just_lora=just_lora, - ) - - for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): - torch.testing.assert_close(p1, p2) - - -@pytest.mark.parametrize('model,tokenizer,peft_config', [ - (configure_tiny_gpt2_model, configure_tiny_gpt2_tokenizer, _gpt2_peft_config()), - (configure_tiny_mistral_model, configure_tiny_mistral_tokenizer, _mistral_peft_config()), -]) -def test_peft_generate(model, tokenizer, peft_config): - pytest.importorskip('peft') - - model = model() - tokenizer = tokenizer() - - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - - hf_model = HuggingFaceModel(model, tokenizer=tokenizer, peft_config=peft_config) - - input_dict = tokenizer(['hello', 'goodbyes'], return_tensors='pt', padding=True) - hf_model.generate(**input_dict, max_new_tokens=5, pad_token_id=tokenizer.pad_token_id) - - -def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config): - 
pytest.importorskip('peft') - - from peft import get_peft_config - - hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=gpt2_peft_config) - metadata = hf_model.get_metadata() - loaded_peft_config = get_peft_config(metadata['model']['peft_config']['content']) - - assert loaded_peft_config == gpt2_peft_config - - -@pytest.mark.parametrize('just_lora', [True, False]) -def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): - peft = pytest.importorskip('peft') - transformers = pytest.importorskip('transformers') - - # Simulate a local model instead of a hub model - tiny_gpt2_model.save_pretrained(tmp_path / 'hf-save-to-load') - tiny_gpt2_model = transformers.AutoModelForCausalLM.from_pretrained(tmp_path / 'hf-save-to-load') - - trainer = get_lm_trainer( - tiny_gpt2_model, - tiny_gpt2_tokenizer, - str(tmp_path), - peft_config=gpt2_peft_config, - device_train_microbatch_size=1, - mlm=False, - just_lora=just_lora, - ) - trainer.fit() +# def test_peft_init(tiny_gpt2_model, gpt2_peft_config): +# pytest.importorskip('peft') +# from peft import PeftModelForCausalLM - from composer.models.huggingface import write_huggingface_pretrained_from_composer_checkpoint - write_huggingface_pretrained_from_composer_checkpoint(str(tmp_path / 'hf-checkpoint.pt'), - tmp_path / 'hf-save-pretrained') +# original_model = copy.deepcopy(tiny_gpt2_model) +# hf_model = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) +# assert isinstance(hf_model.model, PeftModelForCausalLM) +# assert hf_model.model.peft_config['default'].peft_type == 'LORA' +# assert hf_model.model.peft_config['default'].task_type == 'CAUSAL_LM' +# assert hf_model.model.config == original_model.config - # Test we can load back in using transformers interface - loaded_hf_model = transformers.AutoModelForCausalLM.from_pretrained(str(tmp_path / 'hf-save-pretrained')) - for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_hf_model.parameters()): - torch.testing.assert_close(p1, p2) - - # Test we can load back in using peft interface - loaded_peft_model = peft.PeftModelForCausalLM.from_pretrained(tiny_gpt2_model, str(tmp_path / 'hf-save-pretrained')) - for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_peft_model.parameters()): - torch.testing.assert_close(p1, p2) +# def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): +# pytest.importorskip('peft') -@pytest.mark.gpu -@world_size(2) -@pytest.mark.parametrize('just_lora', [True, False]) -@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'), - reason='requires PyTorch 1.13 or higher') -def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size, just_lora): - pytest.importorskip('peft') +# with patch('composer.models.huggingface._peft_installed', False): +# with pytest.raises(ImportError): +# from composer.models import HuggingFaceModel +# _ = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) - fsdp_config = { - 'sharding_strategy': 'FULL_SHARD', - 'cpu_offload': False, - 'mixed_precision': 'PURE', - 'backward_prefetch': 'BACKWARD_PRE', - 'activation_checkpointing': False, - 'activation_cpu_offload': False, - 'verbose': False - } - - stashed_model = copy.deepcopy(tiny_gpt2_model) - - trainer = get_lm_trainer( - tiny_gpt2_model, - tiny_gpt2_tokenizer, - str(tmp_path / 'trainer1'), - peft_config=gpt2_peft_config, - device_train_microbatch_size=1, - mlm=False, - 
fsdp_config=fsdp_config, - just_lora=just_lora, - ) - - for n, p in trainer.state.model.model.named_parameters(): - if 'lora' in n: - assert p.requires_grad - else: - assert not p.requires_grad - - trainer.fit() - trainer.close() - - load_trainer = get_lm_trainer( - stashed_model, - tiny_gpt2_tokenizer, - str(tmp_path / 'trainer2'), - peft_config=gpt2_peft_config, - device_train_microbatch_size=1, - mlm=False, - load_path=str(tmp_path / 'trainer1' / 'hf-checkpoint.pt'), - fsdp_config=fsdp_config, - just_lora=just_lora, - ) - - for n, p in load_trainer.state.model.model.named_parameters(): - if 'lora' in n: - assert p.requires_grad - else: - assert not p.requires_grad - - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - - with FSDP.summon_full_params(trainer.state.model), FSDP.summon_full_params(load_trainer.state.model): - for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): - torch.testing.assert_close(p1, p2) - - if dist.get_global_rank() == 0: - loaded_ckpt_1 = torch.load(str(tmp_path / 'trainer1' / 'hf-checkpoint.pt')) - - # Check that only the LoRA parameters were saved - if just_lora: - assert all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) - else: - assert not all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) - - -def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): - pytest.importorskip('peft') - hf_model = HuggingFaceModel(tiny_gpt2_model, - tokenizer=tiny_gpt2_tokenizer, - peft_config=gpt2_peft_config, - peft_filter_state_dict_trainable=True) - state_dict = hf_model.state_dict() +# @pytest.mark.parametrize('just_lora', [True, False]) +# def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): +# pytest.importorskip('peft') - assert len(state_dict.keys()) == 4 +# trainer = get_lm_trainer( +# tiny_gpt2_model, +# tiny_gpt2_tokenizer, +# str(tmp_path), +# peft_config=gpt2_peft_config, +# device_train_microbatch_size=1, +# mlm=False, +# just_lora=just_lora, +# ) +# trainer.fit() + +# load_trainer = get_lm_trainer( +# tiny_gpt2_model, +# tiny_gpt2_tokenizer, +# str(tmp_path), +# peft_config=gpt2_peft_config, +# device_train_microbatch_size=1, +# mlm=False, +# load_path=str(tmp_path / 'hf-checkpoint.pt'), +# just_lora=just_lora, +# ) + +# for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): +# torch.testing.assert_close(p1, p2) + + +# @pytest.mark.parametrize('model,tokenizer,peft_config', [ +# (configure_tiny_gpt2_model, configure_tiny_gpt2_tokenizer, _gpt2_peft_config()), +# (configure_tiny_mistral_model, configure_tiny_mistral_tokenizer, _mistral_peft_config()), +# ]) +# def test_peft_generate(model, tokenizer, peft_config): +# pytest.importorskip('peft') + +# model = model() +# tokenizer = tokenizer() + +# if tokenizer.pad_token is None: +# tokenizer.pad_token = tokenizer.eos_token + +# hf_model = HuggingFaceModel(model, tokenizer=tokenizer, peft_config=peft_config) + +# input_dict = tokenizer(['hello', 'goodbyes'], return_tensors='pt', padding=True) +# hf_model.generate(**input_dict, max_new_tokens=5, pad_token_id=tokenizer.pad_token_id) + + +# def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config): +# pytest.importorskip('peft') + +# from peft import get_peft_config + +# hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=gpt2_peft_config) +# metadata = hf_model.get_metadata() +# loaded_peft_config = 
get_peft_config(metadata['model']['peft_config']['content']) + +# assert loaded_peft_config == gpt2_peft_config + + +# @pytest.mark.parametrize('just_lora', [True, False]) +# def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): +# peft = pytest.importorskip('peft') +# transformers = pytest.importorskip('transformers') + +# # Simulate a local model instead of a hub model +# tiny_gpt2_model.save_pretrained(tmp_path / 'hf-save-to-load') +# tiny_gpt2_model = transformers.AutoModelForCausalLM.from_pretrained(tmp_path / 'hf-save-to-load') + +# trainer = get_lm_trainer( +# tiny_gpt2_model, +# tiny_gpt2_tokenizer, +# str(tmp_path), +# peft_config=gpt2_peft_config, +# device_train_microbatch_size=1, +# mlm=False, +# just_lora=just_lora, +# ) +# trainer.fit() + +# from composer.models.huggingface import write_huggingface_pretrained_from_composer_checkpoint +# write_huggingface_pretrained_from_composer_checkpoint(str(tmp_path / 'hf-checkpoint.pt'), +# tmp_path / 'hf-save-pretrained') + +# # Test we can load back in using transformers interface +# loaded_hf_model = transformers.AutoModelForCausalLM.from_pretrained(str(tmp_path / 'hf-save-pretrained')) +# for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_hf_model.parameters()): +# torch.testing.assert_close(p1, p2) + +# # Test we can load back in using peft interface +# loaded_peft_model = peft.PeftModelForCausalLM.from_pretrained(tiny_gpt2_model, str(tmp_path / 'hf-save-pretrained')) +# for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_peft_model.parameters()): +# torch.testing.assert_close(p1, p2) + + +# @pytest.mark.gpu +# @world_size(2) +# @pytest.mark.parametrize('just_lora', [True, False]) +# @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'), +# reason='requires PyTorch 1.13 or higher') +# def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size, just_lora): +# pytest.importorskip('peft') + +# fsdp_config = { +# 'sharding_strategy': 'FULL_SHARD', +# 'cpu_offload': False, +# 'mixed_precision': 'PURE', +# 'backward_prefetch': 'BACKWARD_PRE', +# 'activation_checkpointing': False, +# 'activation_cpu_offload': False, +# 'verbose': False +# } + +# stashed_model = copy.deepcopy(tiny_gpt2_model) + +# trainer = get_lm_trainer( +# tiny_gpt2_model, +# tiny_gpt2_tokenizer, +# str(tmp_path / 'trainer1'), +# peft_config=gpt2_peft_config, +# device_train_microbatch_size=1, +# mlm=False, +# fsdp_config=fsdp_config, +# just_lora=just_lora, +# ) + +# for n, p in trainer.state.model.model.named_parameters(): +# if 'lora' in n: +# assert p.requires_grad +# else: +# assert not p.requires_grad + +# trainer.fit() +# trainer.close() + +# load_trainer = get_lm_trainer( +# stashed_model, +# tiny_gpt2_tokenizer, +# str(tmp_path / 'trainer2'), +# peft_config=gpt2_peft_config, +# device_train_microbatch_size=1, +# mlm=False, +# load_path=str(tmp_path / 'trainer1' / 'hf-checkpoint.pt'), +# fsdp_config=fsdp_config, +# just_lora=just_lora, +# ) + +# for n, p in load_trainer.state.model.model.named_parameters(): +# if 'lora' in n: +# assert p.requires_grad +# else: +# assert not p.requires_grad + +# from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + +# with FSDP.summon_full_params(trainer.state.model), FSDP.summon_full_params(load_trainer.state.model): +# for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): +# torch.testing.assert_close(p1, p2) + +# if 
dist.get_global_rank() == 0: +# loaded_ckpt_1 = torch.load(str(tmp_path / 'trainer1' / 'hf-checkpoint.pt')) + +# # Check that only the LoRA parameters were saved +# if just_lora: +# assert all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) +# else: +# assert not all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) + + +# def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): +# pytest.importorskip('peft') + +# hf_model = HuggingFaceModel(tiny_gpt2_model, +# tokenizer=tiny_gpt2_tokenizer, +# peft_config=gpt2_peft_config, +# peft_filter_state_dict_trainable=True) +# state_dict = hf_model.state_dict() + +# assert len(state_dict.keys()) == 4 From 1c869d6bd560d67d38339f2b2ac3f7e04f714bca Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 13:38:43 -0800 Subject: [PATCH 37/64] tests back --- tests/common/models.py | 58 +++++++++++++++++++++--------------------- tests/conftest.py | 22 ++++++++-------- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/tests/common/models.py b/tests/common/models.py index 4efd832b62..a0b66d8929 100644 --- a/tests/common/models.py +++ b/tests/common/models.py @@ -574,32 +574,32 @@ def configure_tiny_t5_hf_model(use_logits: bool = True) -> HuggingFaceModel: return HuggingFaceModel(configure_tiny_t5_model(), configure_tiny_t5_tokenizer(), use_logits) -# def configure_tiny_mistral_model() -> 'PreTrainedModel': -# try: -# from transformers import PreTrainedModel -# assert isinstance(pytest.tiny_mistral_model, PreTrainedModel) -# return copy.deepcopy(pytest.tiny_mistral_model) -# except AttributeError: -# pytest.skip('Composer installed without NLP support') - - -# def configure_tiny_mistral_tokenizer() -> Union['PreTrainedTokenizer', 'PreTrainedTokenizerFast']: -# try: -# from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast -# assert isinstance(pytest.tiny_mistral_tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)) -# return copy.deepcopy(pytest.tiny_mistral_tokenizer) -# except AttributeError: -# pytest.skip('Composer installed without NLP support') - - -# def configure_tiny_mistral_config() -> 'PretrainedConfig': -# try: -# from transformers import PretrainedConfig -# assert isinstance(pytest.tiny_mistral_config, PretrainedConfig) -# return copy.deepcopy(pytest.tiny_mistral_config) -# except AttributeError: -# pytest.skip('Composer installed without NLP support') - - -# def configure_tiny_mistral_hf_model(use_logits: bool = True) -> HuggingFaceModel: -# return HuggingFaceModel(configure_tiny_mistral_model(), configure_tiny_mistral_tokenizer(), use_logits) +def configure_tiny_mistral_model() -> 'PreTrainedModel': + try: + from transformers import PreTrainedModel + assert isinstance(pytest.tiny_mistral_model, PreTrainedModel) + return copy.deepcopy(pytest.tiny_mistral_model) + except AttributeError: + pytest.skip('Composer installed without NLP support') + + +def configure_tiny_mistral_tokenizer() -> Union['PreTrainedTokenizer', 'PreTrainedTokenizerFast']: + try: + from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast + assert isinstance(pytest.tiny_mistral_tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)) + return copy.deepcopy(pytest.tiny_mistral_tokenizer) + except AttributeError: + pytest.skip('Composer installed without NLP support') + + +def configure_tiny_mistral_config() -> 'PretrainedConfig': + try: + from transformers import PretrainedConfig + assert isinstance(pytest.tiny_mistral_config, PretrainedConfig) + return 
copy.deepcopy(pytest.tiny_mistral_config) + except AttributeError: + pytest.skip('Composer installed without NLP support') + + +def configure_tiny_mistral_hf_model(use_logits: bool = True) -> HuggingFaceModel: + return HuggingFaceModel(configure_tiny_mistral_model(), configure_tiny_mistral_tokenizer(), use_logits) diff --git a/tests/conftest.py b/tests/conftest.py index 9e5c21f97a..607e6dd90a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -109,18 +109,18 @@ def pytest_configure(): TRANSFORMERS_INSTALLED = False if TRANSFORMERS_INSTALLED: - # from tests.fixtures.fixtures import (tiny_bert_config_helper, tiny_bert_model_helper, - # tiny_bert_tokenizer_helper, tiny_gpt2_config_helper, - # tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper, - # tiny_mistral_config_helper, tiny_mistral_model_helper, - # tiny_mistral_tokenizer_helper, tiny_opt_config_helper, - # tiny_opt_model_helper, tiny_opt_tokenizer_helper, tiny_t5_config_helper, - # tiny_t5_model_helper, tiny_t5_tokenizer_helper) from tests.fixtures.fixtures import (tiny_bert_config_helper, tiny_bert_model_helper, tiny_bert_tokenizer_helper, tiny_gpt2_config_helper, - tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper, tiny_opt_config_helper, + tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper, + tiny_mistral_config_helper, tiny_mistral_model_helper, + tiny_mistral_tokenizer_helper, tiny_opt_config_helper, tiny_opt_model_helper, tiny_opt_tokenizer_helper, tiny_t5_config_helper, tiny_t5_model_helper, tiny_t5_tokenizer_helper) + # from tests.fixtures.fixtures import (tiny_bert_config_helper, tiny_bert_model_helper, + # tiny_bert_tokenizer_helper, tiny_gpt2_config_helper, + # tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper, tiny_opt_config_helper, + # tiny_opt_model_helper, tiny_opt_tokenizer_helper, tiny_t5_config_helper, + # tiny_t5_model_helper, tiny_t5_tokenizer_helper) pytest.tiny_bert_config = tiny_bert_config_helper() # type: ignore pytest.tiny_bert_model = tiny_bert_model_helper(pytest.tiny_bert_config) # type: ignore pytest.tiny_bert_tokenizer = tiny_bert_tokenizer_helper() # type: ignore @@ -133,9 +133,9 @@ def pytest_configure(): pytest.tiny_t5_config = tiny_t5_config_helper() # type: ignore pytest.tiny_t5_model = tiny_t5_model_helper(pytest.tiny_t5_config) # type: ignore pytest.tiny_t5_tokenizer = tiny_t5_tokenizer_helper() # type: ignore - # pytest.tiny_mistral_config = tiny_mistral_config_helper() # type: ignore - # pytest.tiny_mistral_model = tiny_mistral_model_helper(pytest.tiny_mistral_config) # type: ignore - # pytest.tiny_mistral_tokenizer = tiny_mistral_tokenizer_helper() # type: ignore + pytest.tiny_mistral_config = tiny_mistral_config_helper() # type: ignore + pytest.tiny_mistral_model = tiny_mistral_model_helper(pytest.tiny_mistral_config) # type: ignore + pytest.tiny_mistral_tokenizer = tiny_mistral_tokenizer_helper() # type: ignore def pytest_sessionfinish(session: pytest.Session, exitstatus: int): From c923125ed0e027f7ccd21dad8d57fcb059b972c3 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 13:51:40 -0800 Subject: [PATCH 38/64] add more back --- tests/conftest.py | 5 ----- tests/models/test_hf_model.py | 6 +++--- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 607e6dd90a..bb923e8870 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -116,11 +116,6 @@ def pytest_configure(): tiny_mistral_tokenizer_helper, tiny_opt_config_helper, tiny_opt_model_helper, tiny_opt_tokenizer_helper, tiny_t5_config_helper, tiny_t5_model_helper, 
tiny_t5_tokenizer_helper) - # from tests.fixtures.fixtures import (tiny_bert_config_helper, tiny_bert_model_helper, - # tiny_bert_tokenizer_helper, tiny_gpt2_config_helper, - # tiny_gpt2_model_helper, tiny_gpt2_tokenizer_helper, tiny_opt_config_helper, - # tiny_opt_model_helper, tiny_opt_tokenizer_helper, tiny_t5_config_helper, - # tiny_t5_model_helper, tiny_t5_tokenizer_helper) pytest.tiny_bert_config = tiny_bert_config_helper() # type: ignore pytest.tiny_bert_model = tiny_bert_model_helper(pytest.tiny_bert_config) # type: ignore pytest.tiny_bert_tokenizer = tiny_bert_tokenizer_helper() # type: ignore diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index b34ca67369..10903437b4 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -25,9 +25,9 @@ from composer.utils import dist, is_model_fsdp from tests.common.datasets import RandomTextClassificationDataset, RandomTextLMDataset, RandomTextRegressionDataset from tests.common.markers import device, world_size -# from tests.common.models import (configure_tiny_bert_model, configure_tiny_bert_tokenizer, configure_tiny_gpt2_model, -# configure_tiny_gpt2_tokenizer, configure_tiny_mistral_model, -# configure_tiny_mistral_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer) +from tests.common.models import (configure_tiny_bert_model, configure_tiny_bert_tokenizer, configure_tiny_gpt2_model, + configure_tiny_gpt2_tokenizer, configure_tiny_mistral_model, + configure_tiny_mistral_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer) from tests.common.models import (configure_tiny_bert_model, configure_tiny_bert_tokenizer, configure_tiny_gpt2_model, configure_tiny_gpt2_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer) from tests.loggers.test_remote_uploader_downloader import DummyObjectStore From 06bc6b627d6afb938950382ba50b6b71432d7cf6 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 14:03:17 -0800 Subject: [PATCH 39/64] add back --- tests/models/test_hf_model.py | 400 +++++++++++++++++----------------- 1 file changed, 200 insertions(+), 200 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 10903437b4..c3c59e1ced 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1205,204 +1205,204 @@ def test_eval_forward_generate(device, world_size, hf_model, hf_tokenizer, use_f assert all(isinstance(decoded_generation, str) for decoded_generation in generation2) -# def test_peft_init(tiny_gpt2_model, gpt2_peft_config): -# pytest.importorskip('peft') -# from peft import PeftModelForCausalLM - -# original_model = copy.deepcopy(tiny_gpt2_model) -# hf_model = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) -# assert isinstance(hf_model.model, PeftModelForCausalLM) -# assert hf_model.model.peft_config['default'].peft_type == 'LORA' -# assert hf_model.model.peft_config['default'].task_type == 'CAUSAL_LM' -# assert hf_model.model.config == original_model.config +def test_peft_init(tiny_gpt2_model, gpt2_peft_config): + pytest.importorskip('peft') + from peft import PeftModelForCausalLM + + original_model = copy.deepcopy(tiny_gpt2_model) + hf_model = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) + assert isinstance(hf_model.model, PeftModelForCausalLM) + assert hf_model.model.peft_config['default'].peft_type == 'LORA' + assert hf_model.model.peft_config['default'].task_type == 'CAUSAL_LM' + assert hf_model.model.config == original_model.config + + +def 
test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): + pytest.importorskip('peft') + + with patch('composer.models.huggingface._peft_installed', False): + with pytest.raises(ImportError): + from composer.models import HuggingFaceModel + _ = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) + + +@pytest.mark.parametrize('just_lora', [True, False]) +def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): + pytest.importorskip('peft') + + trainer = get_lm_trainer( + tiny_gpt2_model, + tiny_gpt2_tokenizer, + str(tmp_path), + peft_config=gpt2_peft_config, + device_train_microbatch_size=1, + mlm=False, + just_lora=just_lora, + ) + trainer.fit() + + load_trainer = get_lm_trainer( + tiny_gpt2_model, + tiny_gpt2_tokenizer, + str(tmp_path), + peft_config=gpt2_peft_config, + device_train_microbatch_size=1, + mlm=False, + load_path=str(tmp_path / 'hf-checkpoint.pt'), + just_lora=just_lora, + ) + + for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): + torch.testing.assert_close(p1, p2) + + +@pytest.mark.parametrize('model,tokenizer,peft_config', [ + (configure_tiny_gpt2_model, configure_tiny_gpt2_tokenizer, _gpt2_peft_config()), + (configure_tiny_mistral_model, configure_tiny_mistral_tokenizer, _mistral_peft_config()), +]) +def test_peft_generate(model, tokenizer, peft_config): + pytest.importorskip('peft') + + model = model() + tokenizer = tokenizer() + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + hf_model = HuggingFaceModel(model, tokenizer=tokenizer, peft_config=peft_config) + + input_dict = tokenizer(['hello', 'goodbyes'], return_tensors='pt', padding=True) + hf_model.generate(**input_dict, max_new_tokens=5, pad_token_id=tokenizer.pad_token_id) + + +def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config): + pytest.importorskip('peft') + + from peft import get_peft_config + + hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=gpt2_peft_config) + metadata = hf_model.get_metadata() + loaded_peft_config = get_peft_config(metadata['model']['peft_config']['content']) + + assert loaded_peft_config == gpt2_peft_config + + +@pytest.mark.parametrize('just_lora', [True, False]) +def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): + peft = pytest.importorskip('peft') + transformers = pytest.importorskip('transformers') + + # Simulate a local model instead of a hub model + tiny_gpt2_model.save_pretrained(tmp_path / 'hf-save-to-load') + tiny_gpt2_model = transformers.AutoModelForCausalLM.from_pretrained(tmp_path / 'hf-save-to-load') + + trainer = get_lm_trainer( + tiny_gpt2_model, + tiny_gpt2_tokenizer, + str(tmp_path), + peft_config=gpt2_peft_config, + device_train_microbatch_size=1, + mlm=False, + just_lora=just_lora, + ) + trainer.fit() + + from composer.models.huggingface import write_huggingface_pretrained_from_composer_checkpoint + write_huggingface_pretrained_from_composer_checkpoint(str(tmp_path / 'hf-checkpoint.pt'), + tmp_path / 'hf-save-pretrained') + + # Test we can load back in using transformers interface + loaded_hf_model = transformers.AutoModelForCausalLM.from_pretrained(str(tmp_path / 'hf-save-pretrained')) + for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_hf_model.parameters()): + torch.testing.assert_close(p1, p2) + + # Test we can load back in using peft interface + loaded_peft_model = 
peft.PeftModelForCausalLM.from_pretrained(tiny_gpt2_model, str(tmp_path / 'hf-save-pretrained')) + for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_peft_model.parameters()): + torch.testing.assert_close(p1, p2) + + +@pytest.mark.gpu +@world_size(2) +@pytest.mark.parametrize('just_lora', [True, False]) +@pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'), + reason='requires PyTorch 1.13 or higher') +def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size, just_lora): + pytest.importorskip('peft') + + fsdp_config = { + 'sharding_strategy': 'FULL_SHARD', + 'cpu_offload': False, + 'mixed_precision': 'PURE', + 'backward_prefetch': 'BACKWARD_PRE', + 'activation_checkpointing': False, + 'activation_cpu_offload': False, + 'verbose': False + } + + stashed_model = copy.deepcopy(tiny_gpt2_model) + + trainer = get_lm_trainer( + tiny_gpt2_model, + tiny_gpt2_tokenizer, + str(tmp_path / 'trainer1'), + peft_config=gpt2_peft_config, + device_train_microbatch_size=1, + mlm=False, + fsdp_config=fsdp_config, + just_lora=just_lora, + ) + + for n, p in trainer.state.model.model.named_parameters(): + if 'lora' in n: + assert p.requires_grad + else: + assert not p.requires_grad + + trainer.fit() + trainer.close() + + load_trainer = get_lm_trainer( + stashed_model, + tiny_gpt2_tokenizer, + str(tmp_path / 'trainer2'), + peft_config=gpt2_peft_config, + device_train_microbatch_size=1, + mlm=False, + load_path=str(tmp_path / 'trainer1' / 'hf-checkpoint.pt'), + fsdp_config=fsdp_config, + just_lora=just_lora, + ) + + for n, p in load_trainer.state.model.model.named_parameters(): + if 'lora' in n: + assert p.requires_grad + else: + assert not p.requires_grad + + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + + with FSDP.summon_full_params(trainer.state.model), FSDP.summon_full_params(load_trainer.state.model): + for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): + torch.testing.assert_close(p1, p2) + + if dist.get_global_rank() == 0: + loaded_ckpt_1 = torch.load(str(tmp_path / 'trainer1' / 'hf-checkpoint.pt')) + + # Check that only the LoRA parameters were saved + if just_lora: + assert all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) + else: + assert not all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) + + +def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): + pytest.importorskip('peft') + + hf_model = HuggingFaceModel(tiny_gpt2_model, + tokenizer=tiny_gpt2_tokenizer, + peft_config=gpt2_peft_config, + peft_filter_state_dict_trainable=True) + state_dict = hf_model.state_dict() - -# def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): -# pytest.importorskip('peft') - -# with patch('composer.models.huggingface._peft_installed', False): -# with pytest.raises(ImportError): -# from composer.models import HuggingFaceModel -# _ = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) - - -# @pytest.mark.parametrize('just_lora', [True, False]) -# def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): -# pytest.importorskip('peft') - -# trainer = get_lm_trainer( -# tiny_gpt2_model, -# tiny_gpt2_tokenizer, -# str(tmp_path), -# peft_config=gpt2_peft_config, -# device_train_microbatch_size=1, -# mlm=False, -# just_lora=just_lora, -# ) -# trainer.fit() - -# load_trainer = get_lm_trainer( -# tiny_gpt2_model, -# tiny_gpt2_tokenizer, 
-# str(tmp_path), -# peft_config=gpt2_peft_config, -# device_train_microbatch_size=1, -# mlm=False, -# load_path=str(tmp_path / 'hf-checkpoint.pt'), -# just_lora=just_lora, -# ) - -# for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): -# torch.testing.assert_close(p1, p2) - - -# @pytest.mark.parametrize('model,tokenizer,peft_config', [ -# (configure_tiny_gpt2_model, configure_tiny_gpt2_tokenizer, _gpt2_peft_config()), -# (configure_tiny_mistral_model, configure_tiny_mistral_tokenizer, _mistral_peft_config()), -# ]) -# def test_peft_generate(model, tokenizer, peft_config): -# pytest.importorskip('peft') - -# model = model() -# tokenizer = tokenizer() - -# if tokenizer.pad_token is None: -# tokenizer.pad_token = tokenizer.eos_token - -# hf_model = HuggingFaceModel(model, tokenizer=tokenizer, peft_config=peft_config) - -# input_dict = tokenizer(['hello', 'goodbyes'], return_tensors='pt', padding=True) -# hf_model.generate(**input_dict, max_new_tokens=5, pad_token_id=tokenizer.pad_token_id) - - -# def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config): -# pytest.importorskip('peft') - -# from peft import get_peft_config - -# hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=gpt2_peft_config) -# metadata = hf_model.get_metadata() -# loaded_peft_config = get_peft_config(metadata['model']['peft_config']['content']) - -# assert loaded_peft_config == gpt2_peft_config - - -# @pytest.mark.parametrize('just_lora', [True, False]) -# def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): -# peft = pytest.importorskip('peft') -# transformers = pytest.importorskip('transformers') - -# # Simulate a local model instead of a hub model -# tiny_gpt2_model.save_pretrained(tmp_path / 'hf-save-to-load') -# tiny_gpt2_model = transformers.AutoModelForCausalLM.from_pretrained(tmp_path / 'hf-save-to-load') - -# trainer = get_lm_trainer( -# tiny_gpt2_model, -# tiny_gpt2_tokenizer, -# str(tmp_path), -# peft_config=gpt2_peft_config, -# device_train_microbatch_size=1, -# mlm=False, -# just_lora=just_lora, -# ) -# trainer.fit() - -# from composer.models.huggingface import write_huggingface_pretrained_from_composer_checkpoint -# write_huggingface_pretrained_from_composer_checkpoint(str(tmp_path / 'hf-checkpoint.pt'), -# tmp_path / 'hf-save-pretrained') - -# # Test we can load back in using transformers interface -# loaded_hf_model = transformers.AutoModelForCausalLM.from_pretrained(str(tmp_path / 'hf-save-pretrained')) -# for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_hf_model.parameters()): -# torch.testing.assert_close(p1, p2) - -# # Test we can load back in using peft interface -# loaded_peft_model = peft.PeftModelForCausalLM.from_pretrained(tiny_gpt2_model, str(tmp_path / 'hf-save-pretrained')) -# for p1, p2 in zip(trainer.state.model.model.parameters(), loaded_peft_model.parameters()): -# torch.testing.assert_close(p1, p2) - - -# @pytest.mark.gpu -# @world_size(2) -# @pytest.mark.parametrize('just_lora', [True, False]) -# @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'), -# reason='requires PyTorch 1.13 or higher') -# def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size, just_lora): -# pytest.importorskip('peft') - -# fsdp_config = { -# 'sharding_strategy': 'FULL_SHARD', -# 'cpu_offload': False, -# 'mixed_precision': 'PURE', -# 'backward_prefetch': 
'BACKWARD_PRE', -# 'activation_checkpointing': False, -# 'activation_cpu_offload': False, -# 'verbose': False -# } - -# stashed_model = copy.deepcopy(tiny_gpt2_model) - -# trainer = get_lm_trainer( -# tiny_gpt2_model, -# tiny_gpt2_tokenizer, -# str(tmp_path / 'trainer1'), -# peft_config=gpt2_peft_config, -# device_train_microbatch_size=1, -# mlm=False, -# fsdp_config=fsdp_config, -# just_lora=just_lora, -# ) - -# for n, p in trainer.state.model.model.named_parameters(): -# if 'lora' in n: -# assert p.requires_grad -# else: -# assert not p.requires_grad - -# trainer.fit() -# trainer.close() - -# load_trainer = get_lm_trainer( -# stashed_model, -# tiny_gpt2_tokenizer, -# str(tmp_path / 'trainer2'), -# peft_config=gpt2_peft_config, -# device_train_microbatch_size=1, -# mlm=False, -# load_path=str(tmp_path / 'trainer1' / 'hf-checkpoint.pt'), -# fsdp_config=fsdp_config, -# just_lora=just_lora, -# ) - -# for n, p in load_trainer.state.model.model.named_parameters(): -# if 'lora' in n: -# assert p.requires_grad -# else: -# assert not p.requires_grad - -# from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - -# with FSDP.summon_full_params(trainer.state.model), FSDP.summon_full_params(load_trainer.state.model): -# for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): -# torch.testing.assert_close(p1, p2) - -# if dist.get_global_rank() == 0: -# loaded_ckpt_1 = torch.load(str(tmp_path / 'trainer1' / 'hf-checkpoint.pt')) - -# # Check that only the LoRA parameters were saved -# if just_lora: -# assert all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) -# else: -# assert not all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) - - -# def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path): -# pytest.importorskip('peft') - -# hf_model = HuggingFaceModel(tiny_gpt2_model, -# tokenizer=tiny_gpt2_tokenizer, -# peft_config=gpt2_peft_config, -# peft_filter_state_dict_trainable=True) -# state_dict = hf_model.state_dict() - -# assert len(state_dict.keys()) == 4 + assert len(state_dict.keys()) == 4 From e6ace25007d1f7b2fec13aaffb04e7f6f8a4ab40 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 14:35:57 -0800 Subject: [PATCH 40/64] add upper --- composer/models/huggingface.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index f3855ce6c4..efa911d68d 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -111,6 +111,10 @@ def __init__(self, conda_package='peft', conda_channel='conda-forge') + if peft_config is not None: + # Hugging Face requires the peft type to be upper case, so we do that here + # https://github.com/huggingface/peft/blob/ebbff4023ad276cbcb2466fd7e99be7d3ae0ae11/src/peft/utils/peft_types.py#L22-L51 + peft_config.peft_type = peft_config.peft_type.upper() if peft_config is not None and peft_config.peft_type != 'LORA': raise ValueError( f'PEFT type {peft_config.peft_type} is not supported by HuggingFaceModel. 
Only LORA is supported.') From 23961743885e704b3f4bd77ba8410b2508d5127c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 14:38:34 -0800 Subject: [PATCH 41/64] also task type --- composer/models/huggingface.py | 7 +++++-- tests/models/test_hf_model.py | 2 -- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index efa911d68d..c223a91749 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -112,9 +112,12 @@ def __init__(self, conda_channel='conda-forge') if peft_config is not None: - # Hugging Face requires the peft type to be upper case, so we do that here + # Hugging Face requires the peft type and task type to be upper case, so we do that here # https://github.com/huggingface/peft/blob/ebbff4023ad276cbcb2466fd7e99be7d3ae0ae11/src/peft/utils/peft_types.py#L22-L51 - peft_config.peft_type = peft_config.peft_type.upper() + if isinstance(peft_config.peft_type, str): + peft_config.peft_type = peft_config.peft_type.upper() + if isinstance(peft_config.task_type, str): + peft_config.task_type = peft_config.task_type.upper() if peft_config is not None and peft_config.peft_type != 'LORA': raise ValueError( f'PEFT type {peft_config.peft_type} is not supported by HuggingFaceModel. Only LORA is supported.') diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index c3c59e1ced..a69b56b5a8 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -28,8 +28,6 @@ from tests.common.models import (configure_tiny_bert_model, configure_tiny_bert_tokenizer, configure_tiny_gpt2_model, configure_tiny_gpt2_tokenizer, configure_tiny_mistral_model, configure_tiny_mistral_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer) -from tests.common.models import (configure_tiny_bert_model, configure_tiny_bert_tokenizer, configure_tiny_gpt2_model, - configure_tiny_gpt2_tokenizer, configure_tiny_t5_model, configure_tiny_t5_tokenizer) from tests.loggers.test_remote_uploader_downloader import DummyObjectStore if TYPE_CHECKING: From 87941a4321d56861cd810a48e8e842406d67917c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 15:06:46 -0800 Subject: [PATCH 42/64] remove import --- composer/models/huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index c223a91749..fdb6eda0a4 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -96,7 +96,7 @@ def __init__(self, self.model = model self.config: PretrainedConfig = model.config self.model_forward_args = inspect.getfullargspec(self.model.forward).args - import transformers + if _peft_installed and self.model_forward_args == ['self']: from peft import PeftModel if isinstance(self.model, PeftModel): From db9d6c08b0fa2cd0dede8aa9113098a45a61c1a6 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 15:27:16 -0800 Subject: [PATCH 43/64] add quant and error --- composer/models/huggingface.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index fdb6eda0a4..55a6180a42 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -102,6 +102,14 @@ def __init__(self, if isinstance(self.model, PeftModel): self.model_forward_args = inspect.getfullargspec(self.model.base_model.model.forward).args + # inspect.getfullargspec HuggingFace quantized model could not return args 
correctly + if not self.model_forward_args: + self.model_forward_args = inspect.signature( + self.model.forward).parameters.keys() + + if not self.model_forward_args: + raise ValueError('Could not determine the forward arguments of the model. Please open a GitHub issue.') + self.tokenizer = tokenizer self.peft_filter_state_dict_trainable = peft_filter_state_dict_trainable From 807128520afbf92502fa8708b0d74da06f302a33 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 15:28:41 -0800 Subject: [PATCH 44/64] precommit --- composer/models/huggingface.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 55a6180a42..eff8bb93e7 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -104,9 +104,8 @@ def __init__(self, # inspect.getfullargspec HuggingFace quantized model could not return args correctly if not self.model_forward_args: - self.model_forward_args = inspect.signature( - self.model.forward).parameters.keys() - + self.model_forward_args = inspect.signature(self.model.forward).parameters.keys() + if not self.model_forward_args: raise ValueError('Could not determine the forward arguments of the model. Please open a GitHub issue.') From a82a57dd86777fa959bc49b415b6d8f4af7ad280 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 18 Jan 2024 17:19:22 -0800 Subject: [PATCH 45/64] clean up --- composer/models/huggingface.py | 15 ++++++++++++++- tests/models/test_hf_model.py | 19 +++++++++++++++++-- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index eff8bb93e7..1cc094a924 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -739,7 +739,7 @@ def get_peft_config_from_composer_state_dict(state_dict: Dict[str, Any]) -> Opti state_dict (Dict[str, Any]): The state dict to get the config from Returns: - peft.PeftConfig: The PEFT config + Optional[peft.PeftConfig]: The PEFT config. Will be ``None`` if the model is not a PEFT model. """ try: import peft @@ -850,6 +850,19 @@ def write_huggingface_pretrained_from_composer_checkpoint( def filter_state_dict_peft(state_dict: Dict[str, Any], peft_config: 'PeftConfig', remove_adapter_names: bool = True) -> Dict[str, Any]: + """Filter a state dict to only include the weights needed for a PEFT model + + Note: This function only works with LORA PEFT models right now. + + Args: + state_dict (Dict[str, Any]): The state dict to filter + peft_config (PeftConfig): The PEFT config to use to filter the state dict + remove_adapter_names (bool, optional): Whether to remove the adapter names from the state dict keys. Defaults to True. + + Returns: + Dict[str, Any]: The filtered state dict + """ + if peft_config.peft_type != 'LORA': raise NotImplementedError(f'Only LoRA PEFT is supported. 
Got {peft_config.peft_type}') diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index a69b56b5a8..c55a4cdba3 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1203,18 +1203,33 @@ def test_eval_forward_generate(device, world_size, hf_model, hf_tokenizer, use_f assert all(isinstance(decoded_generation, str) for decoded_generation in generation2) -def test_peft_init(tiny_gpt2_model, gpt2_peft_config): +@pytest.mark.parametrize('peft_type', ['LORA', 'loRa']) +@pytest.mark.parametrize('task_type', ['CAUSAL_LM', 'causal_lm']) +def test_peft_init(peft_type: str, task_type: str, tiny_gpt2_model, gpt2_peft_config): pytest.importorskip('peft') from peft import PeftModelForCausalLM + peft_config = copy.deepcopy(gpt2_peft_config) + peft_config.peft_type = peft_type + peft_config.task_type = task_type + original_model = copy.deepcopy(tiny_gpt2_model) - hf_model = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) + hf_model = HuggingFaceModel(tiny_gpt2_model, peft_config=peft_config) assert isinstance(hf_model.model, PeftModelForCausalLM) assert hf_model.model.peft_config['default'].peft_type == 'LORA' assert hf_model.model.peft_config['default'].task_type == 'CAUSAL_LM' assert hf_model.model.config == original_model.config +def test_peft_init_errors(tiny_gpt2_model, gpt2_peft_config): + pytest.importorskip('peft') + peft_config = copy.deepcopy(gpt2_peft_config) + peft_config.peft_type = 'NOT_LORA' + + with pytest.raises(ValueError): + _ = HuggingFaceModel(tiny_gpt2_model, peft_config=peft_config) + + def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): pytest.importorskip('peft') From 1ba8f4032bdfba285fe78aed5fb183f9e1350c89 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 19 Jan 2024 17:00:46 -0800 Subject: [PATCH 46/64] adjust --- composer/models/huggingface.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 1cc094a924..2ad56fdc27 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -83,7 +83,7 @@ def __init__(self, shift_labels: Optional[bool] = None, allow_embedding_resizing: bool = False, peft_config: Optional['PeftConfig'] = None, - peft_filter_state_dict_trainable: bool = False) -> None: + peft_filter_state_dict_trainable: bool = True) -> None: try: import transformers del transformers # unused @@ -192,11 +192,16 @@ def __init__(self, self.model = get_peft_model(self.model, peft_config) log.info(f'PEFT model created. 
{self.model}') + self.model_is_peft = False + if _peft_installed: + from peft import PeftModel + self.using_peft = isinstance(self.model, PeftModel) + def state_dict(self, *args, **kwargs) -> Dict[str, Any]: """Returns the state dict of the model.""" full_state_dict = super().state_dict(*args, **kwargs) - if self.peft_filter_state_dict_trainable: + if self.using_peft and self.peft_filter_state_dict_trainable: active_adapter = self.model.active_adapter assert isinstance(active_adapter, str) full_state_dict = filter_state_dict_peft(full_state_dict, self.model.peft_config[active_adapter], False) From a0e217f2c644c685efd4bbf3aad41783aecd4bf5 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 22 Jan 2024 17:55:57 -0800 Subject: [PATCH 47/64] simplify attempt --- composer/models/huggingface.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 2ad56fdc27..eeff8a4ce2 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -26,9 +26,9 @@ try: from peft import PeftModel, get_peft_model - _peft_installed = True + peft_installed = True except: - _peft_installed = False + peft_installed = False if TYPE_CHECKING: import transformers @@ -56,7 +56,9 @@ class HuggingFaceModel(ComposerModel): eval_metrics (list[Metric], optional): list of torchmetrics to compute on the eval_dataloader, or be accessible to :class:`Evaluator`s. Default: ``None``. shift_labels (bool, optional): If True, the batch's labels will be shifted before being used to calculate metrics. This should be set to true for CausalLM models and false otherwise. If not specified, `shift_labels` will be set automatically based on the model class name. Default: ``None``. allow_embedding_resizing (bool, optional): If True, the model's embeddings will be automatically resized when they are smaller than the tokenizer vocab size. Default: ``False``. - + peft_config (PeftConfig, optional): Optional PEFT config to apply to the model. If provided, the model will be converted to a PEFT model. Only LoRA is currently supported. + peft_filter_state_dict_trainable (bool, optional): If True _and_ PEFT is active, the state dict will only contain the PEFT weights, not the frozen base model weights. + .. note:: To ensure correct behavior, set `shift_labels` manually if using a custom model (i.e., if `model` is not an instance of a registered 🤗 Transformers class). .. warning:: This wrapper is designed to work with 🤗 datasets that define a `labels` column. @@ -95,16 +97,12 @@ def __init__(self, super().__init__() self.model = model self.config: PretrainedConfig = model.config - self.model_forward_args = inspect.getfullargspec(self.model.forward).args - if _peft_installed and self.model_forward_args == ['self']: + self.model_forward_args = inspect.signature(self.model.forward).parameters.keys() + if peft_installed: from peft import PeftModel if isinstance(self.model, PeftModel): - self.model_forward_args = inspect.getfullargspec(self.model.base_model.model.forward).args - - # inspect.getfullargspec HuggingFace quantized model could not return args correctly - if not self.model_forward_args: - self.model_forward_args = inspect.signature(self.model.forward).parameters.keys() + self.model_forward_args = inspect.signature(self.model.base_model.model.forward).parameters.keys() if not self.model_forward_args: raise ValueError('Could not determine the forward arguments of the model. 
Please open a GitHub issue.') @@ -113,7 +111,7 @@ def __init__(self, self.peft_filter_state_dict_trainable = peft_filter_state_dict_trainable if peft_config is not None: - if not _peft_installed: + if not peft_installed: raise MissingConditionalImportError(extra_deps_group='peft', conda_package='peft', conda_channel='conda-forge') @@ -193,7 +191,7 @@ def __init__(self, log.info(f'PEFT model created. {self.model}') self.model_is_peft = False - if _peft_installed: + if peft_installed: from peft import PeftModel self.using_peft = isinstance(self.model, PeftModel) @@ -576,7 +574,7 @@ def get_metadata(self): } # Also save PEFT config if the model is a peft model - if _peft_installed: + if peft_installed: from peft import PeftModel if isinstance(self.model, PeftModel): active_adapter = self.model.active_adapter @@ -678,7 +676,7 @@ def _is_registered_causal_lm(model: Union[transformers.PreTrainedModel, 'PeftMod conda_package='transformers', conda_channel='conda-forge') from e - if _peft_installed and isinstance(model, PeftModel): + if peft_installed and isinstance(model, PeftModel): model_to_check = model.base_model.model else: model_to_check = model From 090a0261e6ea1cf40faf344a83b0cafe18fac3cd Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 22 Jan 2024 17:58:59 -0800 Subject: [PATCH 48/64] precommit --- composer/models/huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index eeff8a4ce2..35b0824d95 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -58,7 +58,7 @@ class HuggingFaceModel(ComposerModel): allow_embedding_resizing (bool, optional): If True, the model's embeddings will be automatically resized when they are smaller than the tokenizer vocab size. Default: ``False``. peft_config (PeftConfig, optional): Optional PEFT config to apply to the model. If provided, the model will be converted to a PEFT model. Only LoRA is currently supported. peft_filter_state_dict_trainable (bool, optional): If True _and_ PEFT is active, the state dict will only contain the PEFT weights, not the frozen base model weights. - + .. note:: To ensure correct behavior, set `shift_labels` manually if using a custom model (i.e., if `model` is not an instance of a registered 🤗 Transformers class). .. warning:: This wrapper is designed to work with 🤗 datasets that define a `labels` column. From 144340e688e65912e498cf953b1687702ec08bd7 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 22 Jan 2024 22:34:01 -0800 Subject: [PATCH 49/64] fix tests --- composer/models/huggingface.py | 2 ++ tests/models/test_hf_model.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 35b0824d95..6b352b5dc8 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -107,6 +107,8 @@ def __init__(self, if not self.model_forward_args: raise ValueError('Could not determine the forward arguments of the model. 
Please open a GitHub issue.') + self.model_forward_args = set(self.model_forward_args) + self.tokenizer = tokenizer self.peft_filter_state_dict_trainable = peft_filter_state_dict_trainable diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 332fd68c91..59e9c1dc10 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1235,7 +1235,7 @@ def test_peft_init_errors(tiny_gpt2_model, gpt2_peft_config): def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): pytest.importorskip('peft') - with patch('composer.models.huggingface._peft_installed', False): + with patch('composer.models.huggingface.peft_installed', False): with pytest.raises(ImportError): from composer.models import HuggingFaceModel _ = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) From f78d58048f6f28de6454b70d40bc790903db0cca Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 24 Jan 2024 17:41:12 -0800 Subject: [PATCH 50/64] export peft installed --- composer/models/huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 6b352b5dc8..7181b75379 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -38,7 +38,7 @@ log = logging.getLogger(__name__) -__all__ = ['HuggingFaceModel'] +__all__ = ['HuggingFaceModel', 'peft_installed'] class HuggingFaceModel(ComposerModel): From 176fdbeca1159766db3904f98f45d787e152d6d0 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 25 Jan 2024 15:51:41 -0800 Subject: [PATCH 51/64] first part of pr comments --- composer/models/huggingface.py | 66 +++++++++++++++++----------------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 7181b75379..2dcb7402e8 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -46,7 +46,7 @@ class HuggingFaceModel(ComposerModel): A wrapper class that converts 🤗 Transformers models to composer models. Args: - model (transformers.PreTrainedModel): A 🤗 Transformers model. + model (Union[transformers.PreTrainedModel, peft.PeftModel)): A 🤗 Transformers model or a PEFT model. tokenizer (transformers.PreTrainedTokenizer, optional): The tokenizer used to prepare the dataset. Default ``None``. .. note:: If the tokenizer is provided, its config will be saved in the composer checkpoint, and it can be reloaded @@ -94,6 +94,24 @@ def __init__(self, conda_package='transformers', conda_channel='conda-forge') from e + if peft_config is not None: + if not peft_installed: + raise MissingConditionalImportError(extra_deps_group='peft', + conda_package='peft', + conda_channel='conda-forge') + + if peft_config is not None: + # Hugging Face requires the peft type and task type to be upper case, so we do that here + # https://github.com/huggingface/peft/blob/ebbff4023ad276cbcb2466fd7e99be7d3ae0ae11/src/peft/utils/peft_types.py#L22-L51 + if isinstance(peft_config.peft_type, str): + peft_config.peft_type = peft_config.peft_type.upper() + if isinstance(peft_config.task_type, str): + peft_config.task_type = peft_config.task_type.upper() + + if peft_config.peft_type != 'LORA': + raise ValueError( + f'PEFT type {peft_config.peft_type} is not supported by HuggingFaceModel. 
Only LORA is supported.') + super().__init__() self.model = model self.config: PretrainedConfig = model.config @@ -112,22 +130,6 @@ def __init__(self, self.tokenizer = tokenizer self.peft_filter_state_dict_trainable = peft_filter_state_dict_trainable - if peft_config is not None: - if not peft_installed: - raise MissingConditionalImportError(extra_deps_group='peft', - conda_package='peft', - conda_channel='conda-forge') - - if peft_config is not None: - # Hugging Face requires the peft type and task type to be upper case, so we do that here - # https://github.com/huggingface/peft/blob/ebbff4023ad276cbcb2466fd7e99be7d3ae0ae11/src/peft/utils/peft_types.py#L22-L51 - if isinstance(peft_config.peft_type, str): - peft_config.peft_type = peft_config.peft_type.upper() - if isinstance(peft_config.task_type, str): - peft_config.task_type = peft_config.task_type.upper() - if peft_config is not None and peft_config.peft_type != 'LORA': - raise ValueError( - f'PEFT type {peft_config.peft_type} is not supported by HuggingFaceModel. Only LORA is supported.') if self.tokenizer is None: log.warning( @@ -192,7 +194,7 @@ def __init__(self, self.model = get_peft_model(self.model, peft_config) log.info(f'PEFT model created. {self.model}') - self.model_is_peft = False + self.using_peft = False if peft_installed: from peft import PeftModel self.using_peft = isinstance(self.model, PeftModel) @@ -576,18 +578,17 @@ def get_metadata(self): } # Also save PEFT config if the model is a peft model - if peft_installed: - from peft import PeftModel - if isinstance(self.model, PeftModel): - active_adapter = self.model.active_adapter - self.model.peft_config[active_adapter].save_pretrained(str(model_dir)) - with open(model_dir / 'adapter_config.json') as _peft_config: - peft_config = json.load(_peft_config) - - model_output['peft_config'] = { - 'file_extension': '.json', - 'content': peft_config, - } + if self.using_peft: + assert isinstance(self.model, PeftModel) + active_adapter = self.model.active_adapter + self.model.peft_config[active_adapter].save_pretrained(str(model_dir)) + with open(model_dir / 'adapter_config.json') as _peft_config_file: + peft_config = json.load(_peft_config_file) + + model_output['peft_config'] = { + 'file_extension': '.json', + 'content': peft_config, + } if self.tokenizer is not None: for tokenizer_file_name in tokenizer_dir.iterdir(): @@ -752,10 +753,11 @@ def get_peft_config_from_composer_state_dict(state_dict: Dict[str, Any]) -> Opti raise MissingConditionalImportError(extra_deps_group='nlp', conda_package='peft', conda_channel='conda-forge') from e - if 'peft_config' not in state_dict['state']['integrations']['huggingface']['model']: + hf_model_dict = state_dict['state']['integrations']['huggingface']['model'] + if 'peft_config' not in hf_model_dict: return None - peft_config_dict = state_dict['state']['integrations']['huggingface']['model']['peft_config']['content'] + peft_config_dict = hf_model_dict['peft_config']['content'] return peft.get_peft_config(peft_config_dict) From 320ff55e2887724fedb50cb0061124f7e8e88862 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 25 Jan 2024 15:55:41 -0800 Subject: [PATCH 52/64] clean up config usage --- composer/models/huggingface.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 2dcb7402e8..5fddf42510 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -501,10 +501,7 @@ def eval_forward(self, batch, outputs: 
Optional[Any] = None): # HF encoder decoder models like T5 expect either decoder_input_ids or labels, # so we add decoder_input_ids to the batch if it is missing - from transformers import PretrainedConfig - assert isinstance(self.model.config, PretrainedConfig) - model_config: PretrainedConfig = self.model.config - if model_config.is_encoder_decoder and 'decoder_input_ids' not in batch: + if self.config.is_encoder_decoder and 'decoder_input_ids' not in batch: if hasattr(self.model, 'prepare_decoder_input_ids_from_labels'): batch['decoder_input_ids'] = self.model.prepare_decoder_input_ids_from_labels(labels=self.labels) else: @@ -561,9 +558,7 @@ def get_metadata(self): model_dir = tmp_dir / 'model' tokenizer_dir = tmp_dir / 'tokenizer' - from transformers import PretrainedConfig - assert isinstance(self.model.config, PretrainedConfig) - original_model_config: PretrainedConfig = self.model.config + original_model_config: PretrainedConfig = self.config original_model_config.save_pretrained(model_dir) if self.tokenizer is not None: self.tokenizer.save_pretrained(tokenizer_dir) @@ -645,10 +640,7 @@ def generate(self, input_ids: torch.Tensor, **kwargs): if not using_torch_2() and not self.dummy_forward_called and is_model_fsdp(self.model): with torch.no_grad(): maybe_decoder_input_ids = {} - from transformers import PretrainedConfig - assert isinstance(self.model.config, PretrainedConfig) - model_config: PretrainedConfig = self.model.config - if model_config.is_encoder_decoder: + if self.config.is_encoder_decoder: maybe_decoder_input_ids['decoder_input_ids'] = torch.tensor([[0]], dtype=torch.long, device=input_ids.device) From 67e3cb2330b58b6772845ab391e3ab26dba0aaa8 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 25 Jan 2024 16:02:12 -0800 Subject: [PATCH 53/64] refactor underlying model get --- composer/models/huggingface.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 5fddf42510..f960b02a5e 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -116,11 +116,8 @@ def __init__(self, self.model = model self.config: PretrainedConfig = model.config - self.model_forward_args = inspect.signature(self.model.forward).parameters.keys() - if peft_installed: - from peft import PeftModel - if isinstance(self.model, PeftModel): - self.model_forward_args = inspect.signature(self.model.base_model.model.forward).parameters.keys() + model_for_forward = maybe_get_underlying_model(model) + self.model_forward_args = inspect.signature(model_for_forward.forward).parameters.keys() if not self.model_forward_args: raise ValueError('Could not determine the forward arguments of the model. 
Please open a GitHub issue.') @@ -662,6 +659,22 @@ def generate(self, input_ids: torch.Tensor, **kwargs): return self.model.generate(input_ids=input_ids, pad_token_id=pad_token_id, **kwargs) +def maybe_get_underlying_model( + model: Union[transformers.PreTrainedModel, 'PeftModel']) -> Union[transformers.PreTrainedModel, 'PeftModel']: + """Get the underlying PreTrainedModel from a model if it is a PEFT model + + Args: + model (Union[transformers.PreTrainedModel, 'PeftModel']): The model to get the underlying model from + + Returns: + Union[transformers.PreTrainedModel]: The underlying transformers model + """ + if peft_installed and isinstance(model, PeftModel): + return model.base_model.model + else: + return model + + def _is_registered_causal_lm(model: Union[transformers.PreTrainedModel, 'PeftModel']) -> bool: """Return True if model class is either a registered 🤗 Causal LM or a subclass of one""" try: @@ -671,10 +684,7 @@ def _is_registered_causal_lm(model: Union[transformers.PreTrainedModel, 'PeftMod conda_package='transformers', conda_channel='conda-forge') from e - if peft_installed and isinstance(model, PeftModel): - model_to_check = model.base_model.model - else: - model_to_check = model + model_to_check = maybe_get_underlying_model(model) # This try/except is needed until https://github.com/huggingface/transformers/issues/26778 # is resolved in a release. This means that this attempt to automatically detect causal LMs From a438f0a7ffb509d1e7f8c14ed9942e49770903a3 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 25 Jan 2024 16:43:41 -0800 Subject: [PATCH 54/64] fix --- composer/models/huggingface.py | 93 ++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 38 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index f960b02a5e..dc520672ad 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -5,6 +5,7 @@ from __future__ import annotations +import copy import inspect import json import logging @@ -15,7 +16,7 @@ import textwrap import warnings from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set, Tuple, Type, Union import torch from torchmetrics import Metric @@ -115,18 +116,18 @@ def __init__(self, super().__init__() self.model = model self.config: PretrainedConfig = model.config - - model_for_forward = maybe_get_underlying_model(model) - self.model_forward_args = inspect.signature(model_for_forward.forward).parameters.keys() - - if not self.model_forward_args: - raise ValueError('Could not determine the forward arguments of the model. 
Please open a GitHub issue.') - - self.model_forward_args = set(self.model_forward_args) - + self.model_forward_args = self._get_model_forward_args() self.tokenizer = tokenizer - self.peft_filter_state_dict_trainable = peft_filter_state_dict_trainable + self.use_logits = use_logits + self.labels: Optional[torch.Tensor] = None # set in eval_forward() if exists + self.dummy_forward_called = False # Used to make FSDP generate work, see generate function for more details + self.train_metrics: Optional[Dict] = self._get_metric_dict(metrics) if metrics is not None else None + self.val_metrics: Optional[Dict] = self._get_metric_dict( + eval_metrics) if eval_metrics is not None else copy.deepcopy(self.train_metrics) + + is_causal_lm = _is_registered_causal_lm(self.model) + self.shift_labels = is_causal_lm if shift_labels is None else shift_labels if self.tokenizer is None: log.warning( @@ -158,44 +159,34 @@ def __init__(self, f' constructor. The vocab size is sometimes intentionally set to a multiple of 32 or 64 to improve' f' performance.') - self.use_logits = use_logits - - self.train_metrics: Optional[Dict] = None - self.val_metrics: Optional[Dict] = None - - if eval_metrics is not None: - self.val_metrics = {metric.__class__.__name__: metric for metric in eval_metrics} - if metrics is not None: - self.train_metrics = {metric.__class__.__name__: metric for metric in metrics} - # if eval_metrics is None, use the same metrics as train_metrics - if eval_metrics is None: - self.val_metrics = {metric.__class__.__name__: metric for metric in metrics} - - self.labels: Optional[torch.Tensor] = None # set in eval_forward() if exists - - is_causal_lm = _is_registered_causal_lm(self.model) - - self.shift_labels = is_causal_lm if shift_labels is None else shift_labels if is_causal_lm and not self.shift_labels: log.warning('The shift_labels argument was set to False but the model is an instance of a' ' HuggingFace Causal LM. This may lead to incorrect behavior.') # Note: No warning if shift_labels and not is_causal_lm, since the model may simply be a custom class. - self.dummy_forward_called = False - if peft_config is not None: - from peft import PeftModel - if isinstance(self.model, PeftModel): - warnings.warn('PEFT model was passed in directly. Ignoring the provided PEFT config.') - else: - self.model = get_peft_model(self.model, peft_config) - log.info(f'PEFT model created. {self.model}') + self.model = _maybe_get_peft_model(peft_config, self.model) self.using_peft = False if peft_installed: from peft import PeftModel self.using_peft = isinstance(self.model, PeftModel) + def _get_metric_dict(self, metrics: List[Metric]) -> Dict[str, Metric]: + """Returns a dictionary of metrics keyed by their class name.""" + return {metric.__class__.__name__: metric for metric in metrics} + + def _get_model_forward_args(self) -> Set[str]: + """Returns the arguments to the model's forward function.""" + model_forward_args = inspect.signature(maybe_get_underlying_model(self.model).forward).parameters.keys() + + if not model_forward_args: + raise ValueError('Could not determine the forward arguments of the model. 
Please open a GitHub issue.') + + model_forward_args = set(model_forward_args) + + return model_forward_args + def state_dict(self, *args, **kwargs) -> Dict[str, Any]: """Returns the state dict of the model.""" full_state_dict = super().state_dict(*args, **kwargs) @@ -571,8 +562,8 @@ def get_metadata(self): # Also save PEFT config if the model is a peft model if self.using_peft: - assert isinstance(self.model, PeftModel) active_adapter = self.model.active_adapter + assert isinstance(active_adapter, str) self.model.peft_config[active_adapter].save_pretrained(str(model_dir)) with open(model_dir / 'adapter_config.json') as _peft_config_file: peft_config = json.load(_peft_config_file) @@ -659,6 +650,32 @@ def generate(self, input_ids: torch.Tensor, **kwargs): return self.model.generate(input_ids=input_ids, pad_token_id=pad_token_id, **kwargs) +def _maybe_get_peft_model( + peft_config: 'PeftConfig', + model: Union[transformers.PreTrainedModel, 'PeftModel'], +) -> 'PeftModel': + """Creates a PEFT model if the model is not already a PEFT model. + + Args: + peft_config (Optional[peft.PeftConfig]): The PEFT config to use to create the PEFT model + model (Union[transformers.PreTrainedModel, 'PeftModel']): The model to create the PEFT model from + + Returns: + PeftModel: The PEFT model + """ + if not peft_installed: + raise MissingConditionalImportError(extra_deps_group='peft', conda_package='peft', conda_channel='conda-forge') + + if not isinstance(model, PeftModel): + log.info('Creating PEFT model') + peft_model = get_peft_model(model, peft_config) + assert isinstance(peft_model, PeftModel) + return peft_model + else: + warnings.warn('PEFT model was passed in directly. Ignoring the provided PEFT config.') + return model + + def maybe_get_underlying_model( model: Union[transformers.PreTrainedModel, 'PeftModel']) -> Union[transformers.PreTrainedModel, 'PeftModel']: """Get the underlying PreTrainedModel from a model if it is a PEFT model From 090b736b93dbf91160a3ee6983afccd3353acde9 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 25 Jan 2024 16:52:07 -0800 Subject: [PATCH 55/64] finish cleaning up init --- composer/models/huggingface.py | 38 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index dc520672ad..31353c38e9 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -129,49 +129,49 @@ def __init__(self, is_causal_lm = _is_registered_causal_lm(self.model) self.shift_labels = is_causal_lm if shift_labels is None else shift_labels + self._check_tokenizer_and_maybe_resize_embeddings(allow_embedding_resizing) + + if is_causal_lm and not self.shift_labels: + log.warning('The shift_labels argument was set to False but the model is an instance of a' + ' HuggingFace Causal LM. This may lead to incorrect behavior.') + # Note: No warning if shift_labels and not is_causal_lm, since the model may simply be a custom class. + + if peft_config is not None: + self.model = _maybe_get_peft_model(peft_config, self.model) + + self.using_peft = isinstance(self.model, PeftModel) if peft_installed else False + + def _check_tokenizer_and_maybe_resize_embeddings(self, allow_embedding_resizing: bool) -> None: if self.tokenizer is None: log.warning( 'The tokenizer was not provided. 
This means the tokenizer config will not be saved in the checkpoint.') - if tokenizer is not None and self.config.vocab_size < len(tokenizer): + if self.tokenizer is not None and self.config.vocab_size < len(self.tokenizer): if allow_embedding_resizing: # when the embedding size is smaller than the tokenizer vocab size, # the embeddings should get resized to match the tokenizer vocab size log.warning(f'The number of tokens in the tokenizer is greater than the number of tokens in the model.' f' This would cause an error during training.' - f' Resizing the model embeddings to {len(tokenizer)} from {self.config.vocab_size}.') - self.model.resize_token_embeddings(len(tokenizer)) + f' Resizing the model embeddings to {len(self.tokenizer)} from {self.config.vocab_size}.') + self.model.resize_token_embeddings(len(self.tokenizer)) else: raise ValueError( f'The number of tokens in the tokenizer is greater than the number of tokens in the model.' f' This would cause an error during training.' - f' You can resize the model embeddings to {len(tokenizer)} from {self.config.vocab_size}' + f' You can resize the model embeddings to {len(self.tokenizer)} from {self.config.vocab_size}' f' by calling `model.resize_token_embeddings(len(tokenizer))` before calling the `HuggingFaceModel`' f' constructor, or pass `allow_embedding_resizing=True` to have it done automatically.') - elif tokenizer is not None and self.config.vocab_size > len(tokenizer): + elif self.tokenizer is not None and self.config.vocab_size > len(self.tokenizer): # when the embedding size is greater than the tokenizer vocab size, # the embeddings do not _need_ to be resized to match the tokenizer vocab size, # and should be done by the user if desired log.info( f'The number of tokens in the tokenizer is less than the number of tokens in the model.' - f' You may want to resize the model embeddings to {len(tokenizer)} from {self.config.vocab_size}' + f' You may want to resize the model embeddings to {len(self.tokenizer)} from {self.config.vocab_size}' f' by calling `model.resize_token_embeddings(len(tokenizer))` before calling the `HuggingFaceModel`' f' constructor. The vocab size is sometimes intentionally set to a multiple of 32 or 64 to improve' f' performance.') - if is_causal_lm and not self.shift_labels: - log.warning('The shift_labels argument was set to False but the model is an instance of a' - ' HuggingFace Causal LM. This may lead to incorrect behavior.') - # Note: No warning if shift_labels and not is_causal_lm, since the model may simply be a custom class. 
- - if peft_config is not None: - self.model = _maybe_get_peft_model(peft_config, self.model) - - self.using_peft = False - if peft_installed: - from peft import PeftModel - self.using_peft = isinstance(self.model, PeftModel) - def _get_metric_dict(self, metrics: List[Metric]) -> Dict[str, Metric]: """Returns a dictionary of metrics keyed by their class name.""" return {metric.__class__.__name__: metric for metric in metrics} From d608099b6a8580e51e498133f8b209a5ed84e90e Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 25 Jan 2024 16:55:37 -0800 Subject: [PATCH 56/64] adapter name cleanup --- composer/models/huggingface.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 31353c38e9..a010859a0b 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -864,9 +864,9 @@ def write_huggingface_pretrained_from_composer_checkpoint( weights_state_dict = composer_state_dict['state']['model'] torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(weights_state_dict, prefix='model.') - # NOTE: This only works for default adapter name + # NOTE: This only works for default adapter name, not multiple adapters if peft_config is not None: - weights_state_dict = filter_state_dict_peft(weights_state_dict, peft_config) + weights_state_dict = filter_state_dict_peft(weights_state_dict, peft_config, adapter_name='default') torch.save(weights_state_dict, Path(output_folder) / 'adapter_model.bin') else: @@ -875,6 +875,7 @@ def write_huggingface_pretrained_from_composer_checkpoint( def filter_state_dict_peft(state_dict: Dict[str, Any], peft_config: 'PeftConfig', + adapter_name: str = 'default', remove_adapter_names: bool = True) -> Dict[str, Any]: """Filter a state dict to only include the weights needed for a PEFT model @@ -908,8 +909,8 @@ def filter_state_dict_peft(state_dict: Dict[str, Any], to_return[bias_name] = state_dict[bias_name] else: raise NotImplementedError - to_return = {k: v for k, v in to_return.items() if (('lora_' in k and 'default' in k) or ('bias' in k))} + to_return = {k: v for k, v in to_return.items() if (('lora_' in k and adapter_name in k) or ('bias' in k))} if remove_adapter_names: - to_return = {k.replace(f'.default', ''): v for k, v in to_return.items()} + to_return = {k.replace(f'.{adapter_name}', ''): v for k, v in to_return.items()} return to_return From d982d400e61191d270a44f14e3999c27d9811736 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 25 Jan 2024 16:57:08 -0800 Subject: [PATCH 57/64] fix args --- composer/models/huggingface.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index a010859a0b..37fcaa469b 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -194,7 +194,10 @@ def state_dict(self, *args, **kwargs) -> Dict[str, Any]: if self.using_peft and self.peft_filter_state_dict_trainable: active_adapter = self.model.active_adapter assert isinstance(active_adapter, str) - full_state_dict = filter_state_dict_peft(full_state_dict, self.model.peft_config[active_adapter], False) + full_state_dict = filter_state_dict_peft(full_state_dict, + self.model.peft_config[active_adapter], + adapter_name='default', + remove_adapter_names=False) return full_state_dict @@ -884,6 +887,7 @@ def filter_state_dict_peft(state_dict: Dict[str, Any], Args: state_dict (Dict[str, Any]): The state dict to filter peft_config (PeftConfig): The PEFT 
config to use to filter the state dict + adapter_name (str, optional): The name of the adapter to filter for. Defaults to 'default'. remove_adapter_names (bool, optional): Whether to remove the adapter names from the state dict keys. Defaults to True. Returns: From ccfd0044c175bdc3716fbd200ff83df9e8e576a9 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 26 Jan 2024 10:23:15 -0800 Subject: [PATCH 58/64] rename --- composer/models/huggingface.py | 8 ++++---- tests/models/test_hf_model.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 37fcaa469b..e7b4f5ae13 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -58,7 +58,7 @@ class HuggingFaceModel(ComposerModel): shift_labels (bool, optional): If True, the batch's labels will be shifted before being used to calculate metrics. This should be set to true for CausalLM models and false otherwise. If not specified, `shift_labels` will be set automatically based on the model class name. Default: ``None``. allow_embedding_resizing (bool, optional): If True, the model's embeddings will be automatically resized when they are smaller than the tokenizer vocab size. Default: ``False``. peft_config (PeftConfig, optional): Optional PEFT config to apply to the model. If provided, the model will be converted to a PEFT model. Only LoRA is currently supported. - peft_filter_state_dict_trainable (bool, optional): If True _and_ PEFT is active, the state dict will only contain the PEFT weights, not the frozen base model weights. + should_filter_state_dict_peft (bool, optional): If True _and_ PEFT is active, the state dict will only contain the PEFT weights, not the frozen base model weights. .. note:: To ensure correct behavior, set `shift_labels` manually if using a custom model (i.e., if `model` is not an instance of a registered 🤗 Transformers class). 
@@ -86,7 +86,7 @@ def __init__(self, shift_labels: Optional[bool] = None, allow_embedding_resizing: bool = False, peft_config: Optional['PeftConfig'] = None, - peft_filter_state_dict_trainable: bool = True) -> None: + should_filter_state_dict_peft: bool = True) -> None: try: import transformers del transformers # unused @@ -118,7 +118,7 @@ def __init__(self, self.config: PretrainedConfig = model.config self.model_forward_args = self._get_model_forward_args() self.tokenizer = tokenizer - self.peft_filter_state_dict_trainable = peft_filter_state_dict_trainable + self.should_filter_state_dict_peft = should_filter_state_dict_peft self.use_logits = use_logits self.labels: Optional[torch.Tensor] = None # set in eval_forward() if exists self.dummy_forward_called = False # Used to make FSDP generate work, see generate function for more details @@ -191,7 +191,7 @@ def state_dict(self, *args, **kwargs) -> Dict[str, Any]: """Returns the state dict of the model.""" full_state_dict = super().state_dict(*args, **kwargs) - if self.using_peft and self.peft_filter_state_dict_trainable: + if self.using_peft and self.should_filter_state_dict_peft: active_adapter = self.model.active_adapter assert isinstance(active_adapter, str) full_state_dict = filter_state_dict_peft(full_state_dict, diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 59e9c1dc10..e2ec8b4134 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -487,7 +487,7 @@ def get_lm_trainer(hf_model, metrics=metrics, use_logits=True, peft_config=peft_config, - peft_filter_state_dict_trainable=just_lora, + should_filter_state_dict_peft=just_lora, ) vocab_size = hf_model.config.vocab_size @@ -1417,7 +1417,7 @@ def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_con hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=gpt2_peft_config, - peft_filter_state_dict_trainable=True) + should_filter_state_dict_peft=True) state_dict = hf_model.state_dict() assert len(state_dict.keys()) == 4 From 59b8106e313b43bb67d310505514cd1f7bbb53ea Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 26 Jan 2024 13:06:25 -0800 Subject: [PATCH 59/64] Update tests/models/test_hf_model.py Co-authored-by: Irene Dea --- tests/models/test_hf_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index e2ec8b4134..f03d45e4b3 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -474,7 +474,7 @@ def get_lm_trainer(hf_model, sequence_length: int = 4, size: int = 4, peft_config: Optional['PeftConfig'] = None, - just_lora: bool = False): + should_filter_state_dict_peft: bool = False): transformers = pytest.importorskip('transformers') metrics: List[Metric] = [LanguageCrossEntropy(ignore_index=-100)] From fdbdc0cc757b597ec48a7a62bf6cffab511ff003 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 26 Jan 2024 13:06:32 -0800 Subject: [PATCH 60/64] Update tests/models/test_hf_model.py Co-authored-by: Irene Dea --- tests/models/test_hf_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index f03d45e4b3..9e8e5fc4be 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -487,7 +487,7 @@ def get_lm_trainer(hf_model, metrics=metrics, use_logits=True, 
peft_config=peft_config, - should_filter_state_dict_peft=just_lora, + should_filter_state_dict_peft=should_filter_state_dict_peft, ) vocab_size = hf_model.config.vocab_size From 0cb15871183fd0f71b963ee0f258ca8b03769c7d Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 26 Jan 2024 13:35:33 -0800 Subject: [PATCH 61/64] fix rename --- tests/models/test_hf_model.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 9e8e5fc4be..862fdcc128 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1241,8 +1241,8 @@ def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): _ = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) -@pytest.mark.parametrize('just_lora', [True, False]) -def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): +@pytest.mark.parametrize('should_filter_state_dict_peft', [True, False]) +def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, should_filter_state_dict_peft): pytest.importorskip('peft') trainer = get_lm_trainer( @@ -1252,7 +1252,7 @@ def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_c peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, - just_lora=just_lora, + should_filter_state_dict_peft=should_filter_state_dict_peft, ) trainer.fit() @@ -1264,7 +1264,7 @@ def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_c device_train_microbatch_size=1, mlm=False, load_path=str(tmp_path / 'hf-checkpoint.pt'), - just_lora=just_lora, + should_filter_state_dict_peft=should_filter_state_dict_peft, ) for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): @@ -1302,8 +1302,8 @@ def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config): assert loaded_peft_config == gpt2_peft_config -@pytest.mark.parametrize('just_lora', [True, False]) -def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, just_lora): +@pytest.mark.parametrize('should_filter_state_dict_peft', [True, False]) +def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, should_filter_state_dict_peft): peft = pytest.importorskip('peft') transformers = pytest.importorskip('transformers') @@ -1318,7 +1318,7 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_ peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, - just_lora=just_lora, + should_filter_state_dict_peft=should_filter_state_dict_peft, ) trainer.fit() @@ -1339,10 +1339,10 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_ @pytest.mark.gpu @world_size(2) -@pytest.mark.parametrize('just_lora', [True, False]) +@pytest.mark.parametrize('should_filter_state_dict_peft', [True, False]) @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'), reason='requires PyTorch 1.13 or higher') -def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size, just_lora): +def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size, should_filter_state_dict_peft): pytest.importorskip('peft') fsdp_config = { @@ -1365,7 +1365,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config 
device_train_microbatch_size=1, mlm=False, fsdp_config=fsdp_config, - just_lora=just_lora, + should_filter_state_dict_peft=should_filter_state_dict_peft, ) for n, p in trainer.state.model.model.named_parameters(): @@ -1386,7 +1386,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config mlm=False, load_path=str(tmp_path / 'trainer1' / 'hf-checkpoint.pt'), fsdp_config=fsdp_config, - just_lora=just_lora, + should_filter_state_dict_peft=should_filter_state_dict_peft, ) for n, p in load_trainer.state.model.model.named_parameters(): @@ -1405,7 +1405,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config loaded_ckpt_1 = torch.load(str(tmp_path / 'trainer1' / 'hf-checkpoint.pt')) # Check that only the LoRA parameters were saved - if just_lora: + if should_filter_state_dict_peft: assert all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) else: assert not all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) From d7cf36105b54d59da7f054525de3968639219f99 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 26 Jan 2024 13:56:31 -0800 Subject: [PATCH 62/64] precommit --- tests/models/test_hf_model.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 862fdcc128..0a00045474 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -1242,7 +1242,8 @@ def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): @pytest.mark.parametrize('should_filter_state_dict_peft', [True, False]) -def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, should_filter_state_dict_peft): +def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, + should_filter_state_dict_peft): pytest.importorskip('peft') trainer = get_lm_trainer( @@ -1303,7 +1304,8 @@ def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config): @pytest.mark.parametrize('should_filter_state_dict_peft', [True, False]) -def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, should_filter_state_dict_peft): +def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, + should_filter_state_dict_peft): peft = pytest.importorskip('peft') transformers = pytest.importorskip('transformers') @@ -1342,7 +1344,8 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_ @pytest.mark.parametrize('should_filter_state_dict_peft', [True, False]) @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'), reason='requires PyTorch 1.13 or higher') -def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size, should_filter_state_dict_peft): +def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size, + should_filter_state_dict_peft): pytest.importorskip('peft') fsdp_config = { From 54c041f6d98834a20812f5ccc7753b3964d46e6d Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 29 Jan 2024 10:11:44 -0800 Subject: [PATCH 63/64] nit --- composer/models/huggingface.py | 8 ++++---- tests/models/test_hf_model.py | 31 +++++++++++++++---------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index e7b4f5ae13..0c5f24af7c 100644 --- a/composer/models/huggingface.py +++ 
b/composer/models/huggingface.py @@ -58,7 +58,7 @@ class HuggingFaceModel(ComposerModel): shift_labels (bool, optional): If True, the batch's labels will be shifted before being used to calculate metrics. This should be set to true for CausalLM models and false otherwise. If not specified, `shift_labels` will be set automatically based on the model class name. Default: ``None``. allow_embedding_resizing (bool, optional): If True, the model's embeddings will be automatically resized when they are smaller than the tokenizer vocab size. Default: ``False``. peft_config (PeftConfig, optional): Optional PEFT config to apply to the model. If provided, the model will be converted to a PEFT model. Only LoRA is currently supported. - should_filter_state_dict_peft (bool, optional): If True _and_ PEFT is active, the state dict will only contain the PEFT weights, not the frozen base model weights. + should_save_just_peft (bool, optional): If True _and_ PEFT is active, the state dict will only contain the PEFT weights, not the frozen base model weights. .. note:: To ensure correct behavior, set `shift_labels` manually if using a custom model (i.e., if `model` is not an instance of a registered 🤗 Transformers class). @@ -86,7 +86,7 @@ def __init__(self, shift_labels: Optional[bool] = None, allow_embedding_resizing: bool = False, peft_config: Optional['PeftConfig'] = None, - should_filter_state_dict_peft: bool = True) -> None: + should_save_just_peft: bool = True) -> None: try: import transformers del transformers # unused @@ -118,7 +118,7 @@ def __init__(self, self.config: PretrainedConfig = model.config self.model_forward_args = self._get_model_forward_args() self.tokenizer = tokenizer - self.should_filter_state_dict_peft = should_filter_state_dict_peft + self.should_save_just_peft = should_save_just_peft self.use_logits = use_logits self.labels: Optional[torch.Tensor] = None # set in eval_forward() if exists self.dummy_forward_called = False # Used to make FSDP generate work, see generate function for more details @@ -191,7 +191,7 @@ def state_dict(self, *args, **kwargs) -> Dict[str, Any]: """Returns the state dict of the model.""" full_state_dict = super().state_dict(*args, **kwargs) - if self.using_peft and self.should_filter_state_dict_peft: + if self.using_peft and self.should_save_just_peft: active_adapter = self.model.active_adapter assert isinstance(active_adapter, str) full_state_dict = filter_state_dict_peft(full_state_dict, diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index 0a00045474..d5e4d29d3e 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -474,7 +474,7 @@ def get_lm_trainer(hf_model, sequence_length: int = 4, size: int = 4, peft_config: Optional['PeftConfig'] = None, - should_filter_state_dict_peft: bool = False): + should_save_just_peft: bool = False): transformers = pytest.importorskip('transformers') metrics: List[Metric] = [LanguageCrossEntropy(ignore_index=-100)] @@ -487,7 +487,7 @@ def get_lm_trainer(hf_model, metrics=metrics, use_logits=True, peft_config=peft_config, - should_filter_state_dict_peft=should_filter_state_dict_peft, + should_save_just_peft=should_save_just_peft, ) vocab_size = hf_model.config.vocab_size @@ -1241,9 +1241,8 @@ def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): _ = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) -@pytest.mark.parametrize('should_filter_state_dict_peft', [True, False]) -def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, 
gpt2_peft_config, tmp_path, - should_filter_state_dict_peft): +@pytest.mark.parametrize('should_save_just_peft', [True, False]) +def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, should_save_just_peft): pytest.importorskip('peft') trainer = get_lm_trainer( @@ -1253,7 +1252,7 @@ def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_c peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, - should_filter_state_dict_peft=should_filter_state_dict_peft, + should_save_just_peft=should_save_just_peft, ) trainer.fit() @@ -1265,7 +1264,7 @@ def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_c device_train_microbatch_size=1, mlm=False, load_path=str(tmp_path / 'hf-checkpoint.pt'), - should_filter_state_dict_peft=should_filter_state_dict_peft, + should_save_just_peft=should_save_just_peft, ) for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): @@ -1303,9 +1302,9 @@ def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config): assert loaded_peft_config == gpt2_peft_config -@pytest.mark.parametrize('should_filter_state_dict_peft', [True, False]) +@pytest.mark.parametrize('should_save_just_peft', [True, False]) def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, - should_filter_state_dict_peft): + should_save_just_peft): peft = pytest.importorskip('peft') transformers = pytest.importorskip('transformers') @@ -1320,7 +1319,7 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_ peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, - should_filter_state_dict_peft=should_filter_state_dict_peft, + should_save_just_peft=should_save_just_peft, ) trainer.fit() @@ -1341,11 +1340,11 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_ @pytest.mark.gpu @world_size(2) -@pytest.mark.parametrize('should_filter_state_dict_peft', [True, False]) +@pytest.mark.parametrize('should_save_just_peft', [True, False]) @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'), reason='requires PyTorch 1.13 or higher') def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size, - should_filter_state_dict_peft): + should_save_just_peft): pytest.importorskip('peft') fsdp_config = { @@ -1368,7 +1367,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config device_train_microbatch_size=1, mlm=False, fsdp_config=fsdp_config, - should_filter_state_dict_peft=should_filter_state_dict_peft, + should_save_just_peft=should_save_just_peft, ) for n, p in trainer.state.model.model.named_parameters(): @@ -1389,7 +1388,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config mlm=False, load_path=str(tmp_path / 'trainer1' / 'hf-checkpoint.pt'), fsdp_config=fsdp_config, - should_filter_state_dict_peft=should_filter_state_dict_peft, + should_save_just_peft=should_save_just_peft, ) for n, p in load_trainer.state.model.model.named_parameters(): @@ -1408,7 +1407,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config loaded_ckpt_1 = torch.load(str(tmp_path / 'trainer1' / 'hf-checkpoint.pt')) # Check that only the LoRA parameters were saved - if should_filter_state_dict_peft: + if should_save_just_peft: assert all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) else: assert not all('lora' 
in k for k in loaded_ckpt_1['state']['model'].keys()) @@ -1420,7 +1419,7 @@ def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_con hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=gpt2_peft_config, - should_filter_state_dict_peft=True) + should_save_just_peft=True) state_dict = hf_model.state_dict() assert len(state_dict.keys()) == 4 From e9c4d4a3209639ab75e1244b75d0e96e07595b13 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 29 Jan 2024 10:15:56 -0800 Subject: [PATCH 64/64] rename --- composer/models/huggingface.py | 8 ++++---- tests/models/test_hf_model.py | 30 +++++++++++++++--------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 0c5f24af7c..e635f0cec7 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -58,7 +58,7 @@ class HuggingFaceModel(ComposerModel): shift_labels (bool, optional): If True, the batch's labels will be shifted before being used to calculate metrics. This should be set to true for CausalLM models and false otherwise. If not specified, `shift_labels` will be set automatically based on the model class name. Default: ``None``. allow_embedding_resizing (bool, optional): If True, the model's embeddings will be automatically resized when they are smaller than the tokenizer vocab size. Default: ``False``. peft_config (PeftConfig, optional): Optional PEFT config to apply to the model. If provided, the model will be converted to a PEFT model. Only LoRA is currently supported. - should_save_just_peft (bool, optional): If True _and_ PEFT is active, the state dict will only contain the PEFT weights, not the frozen base model weights. + should_save_peft_only (bool, optional): If True _and_ PEFT is active, the state dict will only contain the PEFT weights, not the frozen base model weights. .. note:: To ensure correct behavior, set `shift_labels` manually if using a custom model (i.e., if `model` is not an instance of a registered 🤗 Transformers class). 
@@ -86,7 +86,7 @@ def __init__(self, shift_labels: Optional[bool] = None, allow_embedding_resizing: bool = False, peft_config: Optional['PeftConfig'] = None, - should_save_just_peft: bool = True) -> None: + should_save_peft_only: bool = True) -> None: try: import transformers del transformers # unused @@ -118,7 +118,7 @@ def __init__(self, self.config: PretrainedConfig = model.config self.model_forward_args = self._get_model_forward_args() self.tokenizer = tokenizer - self.should_save_just_peft = should_save_just_peft + self.should_save_peft_only = should_save_peft_only self.use_logits = use_logits self.labels: Optional[torch.Tensor] = None # set in eval_forward() if exists self.dummy_forward_called = False # Used to make FSDP generate work, see generate function for more details @@ -191,7 +191,7 @@ def state_dict(self, *args, **kwargs) -> Dict[str, Any]: """Returns the state dict of the model.""" full_state_dict = super().state_dict(*args, **kwargs) - if self.using_peft and self.should_save_just_peft: + if self.using_peft and self.should_save_peft_only: active_adapter = self.model.active_adapter assert isinstance(active_adapter, str) full_state_dict = filter_state_dict_peft(full_state_dict, diff --git a/tests/models/test_hf_model.py b/tests/models/test_hf_model.py index d5e4d29d3e..aeb42cd513 100644 --- a/tests/models/test_hf_model.py +++ b/tests/models/test_hf_model.py @@ -474,7 +474,7 @@ def get_lm_trainer(hf_model, sequence_length: int = 4, size: int = 4, peft_config: Optional['PeftConfig'] = None, - should_save_just_peft: bool = False): + should_save_peft_only: bool = False): transformers = pytest.importorskip('transformers') metrics: List[Metric] = [LanguageCrossEntropy(ignore_index=-100)] @@ -487,7 +487,7 @@ def get_lm_trainer(hf_model, metrics=metrics, use_logits=True, peft_config=peft_config, - should_save_just_peft=should_save_just_peft, + should_save_peft_only=should_save_peft_only, ) vocab_size = hf_model.config.vocab_size @@ -1241,8 +1241,8 @@ def test_peft_init_not_installed(tiny_gpt2_model, gpt2_peft_config): _ = HuggingFaceModel(tiny_gpt2_model, peft_config=gpt2_peft_config) -@pytest.mark.parametrize('should_save_just_peft', [True, False]) -def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, should_save_just_peft): +@pytest.mark.parametrize('should_save_peft_only', [True, False]) +def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, should_save_peft_only): pytest.importorskip('peft') trainer = get_lm_trainer( @@ -1252,7 +1252,7 @@ def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_c peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, - should_save_just_peft=should_save_just_peft, + should_save_peft_only=should_save_peft_only, ) trainer.fit() @@ -1264,7 +1264,7 @@ def test_peft_trains_and_loads(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_c device_train_microbatch_size=1, mlm=False, load_path=str(tmp_path / 'hf-checkpoint.pt'), - should_save_just_peft=should_save_just_peft, + should_save_peft_only=should_save_peft_only, ) for p1, p2 in zip(trainer.state.model.parameters(), load_trainer.state.model.parameters()): @@ -1302,9 +1302,9 @@ def test_peft_metadata(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config): assert loaded_peft_config == gpt2_peft_config -@pytest.mark.parametrize('should_save_just_peft', [True, False]) +@pytest.mark.parametrize('should_save_peft_only', [True, False]) def 
test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, - should_save_just_peft): + should_save_peft_only): peft = pytest.importorskip('peft') transformers = pytest.importorskip('transformers') @@ -1319,7 +1319,7 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_ peft_config=gpt2_peft_config, device_train_microbatch_size=1, mlm=False, - should_save_just_peft=should_save_just_peft, + should_save_peft_only=should_save_peft_only, ) trainer.fit() @@ -1340,11 +1340,11 @@ def test_peft_write_hf_from_composer(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_ @pytest.mark.gpu @world_size(2) -@pytest.mark.parametrize('should_save_just_peft', [True, False]) +@pytest.mark.parametrize('should_save_peft_only', [True, False]) @pytest.mark.skipif(version.parse(torch.__version__) < version.parse('1.13.0'), reason='requires PyTorch 1.13 or higher') def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config, tmp_path, world_size, - should_save_just_peft): + should_save_peft_only): pytest.importorskip('peft') fsdp_config = { @@ -1367,7 +1367,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config device_train_microbatch_size=1, mlm=False, fsdp_config=fsdp_config, - should_save_just_peft=should_save_just_peft, + should_save_peft_only=should_save_peft_only, ) for n, p in trainer.state.model.model.named_parameters(): @@ -1388,7 +1388,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config mlm=False, load_path=str(tmp_path / 'trainer1' / 'hf-checkpoint.pt'), fsdp_config=fsdp_config, - should_save_just_peft=should_save_just_peft, + should_save_peft_only=should_save_peft_only, ) for n, p in load_trainer.state.model.model.named_parameters(): @@ -1407,7 +1407,7 @@ def test_peft_fsdp_trains(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_config loaded_ckpt_1 = torch.load(str(tmp_path / 'trainer1' / 'hf-checkpoint.pt')) # Check that only the LoRA parameters were saved - if should_save_just_peft: + if should_save_peft_only: assert all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) else: assert not all('lora' in k for k in loaded_ckpt_1['state']['model'].keys()) @@ -1419,7 +1419,7 @@ def test_filtered_state_dict(tiny_gpt2_model, tiny_gpt2_tokenizer, gpt2_peft_con hf_model = HuggingFaceModel(tiny_gpt2_model, tokenizer=tiny_gpt2_tokenizer, peft_config=gpt2_peft_config, - should_save_just_peft=True) + should_save_peft_only=True) state_dict = hf_model.state_dict() assert len(state_dict.keys()) == 4
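The series above ends with `peft_config` and `should_save_peft_only` as the public arguments on `HuggingFaceModel`. The following is a minimal usage sketch, not part of the patch series: it assumes `composer` (with the `peft` extra), `transformers`, and `torch` are installed, and the model name, LoRA hyperparameters, `target_modules`, and the synthetic dataset are illustrative placeholders rather than values taken from the patches.

    import torch
    from torch.utils.data import DataLoader, Dataset

    from peft import LoraConfig
    from transformers import AutoModelForCausalLM, AutoTokenizer

    from composer import Trainer
    from composer.models import HuggingFaceModel

    model_name = 'gpt2'  # placeholder; any Hugging Face causal LM should work
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Only LoRA is supported by HuggingFaceModel, per the validation added earlier in the series.
    peft_config = LoraConfig(
        task_type='CAUSAL_LM',
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=['c_attn'],  # assumption: GPT-2 style fused attention projection
    )


    class TinyCausalLMDataset(Dataset):
        """Synthetic stand-in for a real dataset, used only to make the sketch runnable."""

        def __init__(self, vocab_size: int, size: int = 8, seq_len: int = 16):
            self.input_ids = torch.randint(0, vocab_size, (size, seq_len))

        def __len__(self) -> int:
            return len(self.input_ids)

        def __getitem__(self, idx: int) -> dict:
            ids = self.input_ids[idx]
            return {'input_ids': ids, 'attention_mask': torch.ones_like(ids), 'labels': ids}


    composer_model = HuggingFaceModel(
        model,
        tokenizer=tokenizer,
        peft_config=peft_config,      # the base model is wrapped with get_peft_model internally
        should_save_peft_only=True,   # state dicts contain only the adapter weights, not the frozen base
    )

    trainer = Trainer(
        model=composer_model,
        train_dataloader=DataLoader(TinyCausalLMDataset(tokenizer.vocab_size), batch_size=2),
        max_duration='1ep',
        save_folder='checkpoints',
    )
    trainer.fit()

With `should_save_peft_only=True` the saved checkpoints stay small, holding just the LoRA weights; the adapter can later be reattached to the frozen base model, or exported in Hugging Face format via `write_huggingface_pretrained_from_composer_checkpoint` as exercised in the tests above.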