mosaicml · irenedea · Jul 21, 2024 · Jul 21, 2024 · Jul 21, 2024 · Jul 21, 2024
@@ -436,7 +436,7 @@ def _save_checkpoint(self, state: State, logger: Logger):
         cpu_offload = True
 
         # Add a dtensor->cpu tensor hook to avoid CUDA OOM
-        def dtensor_to_tensor_hook(
+        def tensor_hook(
             module: nn.Module,
             state_dict: Dict[str, Any],
             prefix: str,
@@ -449,20 +449,23 @@ def dtensor_to_tensor_hook(
                     dtensor_fqns.append(fqn)
                     tensor = tensor.full_tensor()  # type: ignore
                     if dist.get_global_rank() == 0:
+                        # Offload any DTensors to CPU
                         if cpu_offload:
                             tensor = tensor.cpu()
                         state_dict[fqn] = tensor
+                    else:
+                        state_dict[fqn] = None
+                # Convert the state dict to the requested precision
+                if isinstance(tensor, torch.Tensor):
+                    state_dict[fqn] = tensor.to(dtype=self.dtype)
+                del tensor
             if dist.get_global_rank() != 0:
-                for fqn in dtensor_fqns:
-                    del state_dict[fqn]
+                state_dict = {}
             return state_dict
 
         hooks = []
         for _, module in state_dict_model.named_modules():
-            if isinstance(module, FSDP):
-                hooks.append(
-                    module._register_state_dict_hook(dtensor_to_tensor_hook),
-                )
+            hooks.append(module._register_state_dict_hook(tensor_hook),)
 
         state_dict = get_model_state_dict(
             state_dict_model,
@@ -474,11 +477,6 @@ def dtensor_to_tensor_hook(
         for hook in hooks:
             hook.remove()
 
-        # Convert the state dict to the requested precision
-        for k, v in state_dict.items():
-            if isinstance(v, torch.Tensor):
-                state_dict[k] = v.to(dtype=self.dtype)
-
         new_model_instance = None  # Need this for pyright because variable could be unbound
 
         if dist.get_global_rank() == 0:
@@ -537,7 +535,7 @@ def dtensor_to_tensor_hook(
                 original_tokenizer.save_pretrained(temp_save_dir)
 
             # Only need to edit files for MPT because it has custom code
-            if original_model.config.model_type == 'mpt':
+            if new_model_instance.config.model_type == 'mpt':
                 log.debug('Editing MPT files for HuggingFace compatibility')
                 edit_files_for_hf_compatibility(
                     temp_save_dir,

@@ -260,7 +260,7 @@ def train(cfg: DictConfig) -> Trainer:
 
     if fsdp_config is not None:
         if 'load_planner' in fsdp_config:
-            load_planners = fsdp_config['load_planner'].items()
+            load_planners = list(fsdp_config['load_planner'].items())
             if len(load_planners) > 1:
                 raise ValueError(
                     'Only one load planner can be specified in the config.',
@@ -272,7 +272,7 @@ def train(cfg: DictConfig) -> Trainer:
             )
 
         if 'save_planner' in fsdp_config:
-            save_planners = fsdp_config['save_planner'].items()
+            save_planners = list(fsdp_config['save_planner'].items())
             if len(save_planners) > 1:
                 raise ValueError(
                     'Only one save planner can be specified in the config.',

@@ -11,7 +11,6 @@
     Any,
     Dict,
     List,
-    Mapping,
     Optional,
     Tuple,
     Union,
@@ -23,7 +22,6 @@
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
-    PretrainedConfig,
     PreTrainedModel,
     PreTrainedTokenizerBase,
 )
@@ -36,7 +34,7 @@
 from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP
 from llmfoundry.models.layers.attention import is_flash_v2_installed
 from llmfoundry.models.utils import init_empty_weights
-from llmfoundry.utils.config_utils import get_hf_config_value
+from llmfoundry.utils.config_utils import set_config_overrides
 
 if TYPE_CHECKING:
     from peft import PeftConfig, PeftModel
@@ -105,9 +103,12 @@ def __init__(
             config_overrides=config_overrides,
             load_in_8bit=load_in_8bit,
             pretrained=pretrained,
-            prepare_for_fsdp=True,
         )
 
+        model = self.transform_model(model)
+
+        ComposerHFCausalLM.prepare_inner_model(model, init_device)
+
         train_metrics, eval_metrics = ComposerHFCausalLM.build_metrics(
             use_train_metrics=use_train_metrics,
             additional_train_metrics=additional_train_metrics,
@@ -121,7 +122,7 @@ def __init__(
 
         peft_config_object = None
         if peft_config is not None:
-            peft_config_object = self._get_peft_config(peft_config)
+            peft_config_object = self.get_peft_config(peft_config)
 
         # Set up config args for the model construction and base classes
         super().__init__(
@@ -135,6 +136,17 @@ def __init__(
             should_save_peft_only=should_save_peft_only,
         )
 
+    def transform_model(self, model: PreTrainedModel) -> PreTrainedModel:
+        """Transforms the model after initialization.
+
+        This base implementation is a no-op: it returns ``model`` unchanged.
+        ``__init__`` calls it on the freshly built inner model and passes the
+        result to ``prepare_inner_model``.
+        NOTE(review): presumably intended as an override point for
+        subclasses -- confirm against callers.
+
+        Args:
+            model (PreTrainedModel): The model to transform.
+
+        Returns:
+            PreTrainedModel: The transformed model.
+        """
+        return model
+
     @staticmethod
     def build_metrics(
         use_train_metrics: bool,
@@ -179,7 +191,6 @@ def build_inner_model(
         config_overrides: Dict[str, Any],
         load_in_8bit: bool,
         pretrained: bool,
-        prepare_for_fsdp: bool = False,
     ) -> Union[PreTrainedModel, 'PeftModel']:
         """Builds the inner model for the ComposerHFCausalLM.
 
@@ -259,50 +270,7 @@ def _autoset_attn_implementation_monkeypatch(
             _autoset_attn_implementation_monkeypatch,
         )
 
-        # set config overrides
-        for k, v in config_overrides.items():
-            if not hasattr(config, k):
-                raise ValueError(
-                    f'config does not have attribute "{k}" to override ({k}: {v}).',
-                )
-
-            attr = getattr(config, k)
-            # attempt to disallow typos in nested configs
-            if isinstance(attr, Mapping):
-                extra_keys = [_k for _k in v.keys() if _k not in attr.keys()]
-                if extra_keys:
-                    raise ValueError(
-                        f'Config dict override got unknown keys. ' +
-                        f'Extra keys: {extra_keys}. ' +
-                        f'Expected (a subset of) keys: {list(attr.keys())}.',
-                    )
-                getattr(config, k).update(v)
-            # necessary case to allow for rope_scaling to be overriden in llama config
-            elif attr is None and isinstance(v, Mapping):
-                setattr(config, k, {})
-                getattr(config, k).update(v)
-            elif isinstance(attr, PretrainedConfig):
-                if not isinstance(v, Mapping):
-                    raise ValueError(
-                        f'Expected a dictionary for config override {k}, but got {v}.',
-                    )
-
-                for _k, _v in v.items():
-                    if not hasattr(attr, _k):
-                        raise ValueError(
-                            f'config does not have attribute "{_k}" to override ({k}: {_k}: {_v}).',
-                        )
-                    setattr(attr, _k, _v)
-            else:
-                setattr(config, k, v)
-
-        if hasattr(config, 'attn_config') and get_hf_config_value(
-            config.attn_config,
-            'seq_parallel_world_size',
-        ) is not None:
-            raise NotImplementedError(
-                'Sequence Parallelism is not supported for HuggingFace models.',
-            )
+        set_config_overrides(config, config_overrides)
 
         # We need to have all non-zero local ranks be not-pretrained
         # Rank 0 will still be pretrained, and distribute the weights appropriately
@@ -393,12 +361,9 @@ def _autoset_attn_implementation_monkeypatch(
                 pretrained_lora_id_or_path,
             )
 
-        if prepare_for_fsdp:
-            ComposerHFCausalLM.prepare_inner_model(model, init_device)
         return model
 
-    @staticmethod
-    def _get_peft_config(peft_config_dict: Dict[str, Any]) -> 'PeftConfig':
+    def get_peft_config(self, peft_config_dict: Dict[str, Any]) -> 'PeftConfig':
         if peft_installed:
             from peft import LoraConfig
             peft_type = peft_config_dict.get('peft_type', '')

@@ -812,3 +812,45 @@ def _verify_uc_path(path: str) -> bool:
                 f'but your `UCVolumeDatasetSource` might be invalid.',
             )
     return False
+
+
+def set_config_overrides(
+    config: PretrainedConfig,
+    config_overrides: Dict[str, Any],
+):
+    """Applies ``config_overrides`` to ``config`` in place.
+
+    Each override key must already exist as an attribute on ``config``. How
+    the value is applied depends on the type of the existing attribute:
+
+    * Mapping: the override may only contain keys already present in the
+      existing mapping; the existing mapping is then updated with it.
+    * ``None`` with a Mapping override: the attribute is replaced by a new
+      dict populated from the override.
+    * ``PretrainedConfig``: the override must be a Mapping whose keys are all
+      existing attributes of the nested config; each one is set on it.
+    * Anything else: the attribute is overwritten with the override value.
+
+    Args:
+        config (PretrainedConfig): The config object to modify.
+        config_overrides (Dict[str, Any]): Attribute names mapped to the
+            values (or nested mappings) to override them with.
+
+    Raises:
+        ValueError: If ``config`` has no attribute for an override key, if a
+            nested mapping override contains unknown keys, if the override
+            for a nested ``PretrainedConfig`` is not a Mapping, or if the
+            nested config lacks one of the override's attributes.
+    """
+    # set config overrides
+    for k, v in config_overrides.items():
+        if not hasattr(config, k):
+            raise ValueError(
+                f'config does not have attribute "{k}" to override ({k}: {v}).',
+            )
+
+        attr = getattr(config, k)
+        # attempt to disallow typos in nested configs
+        # NOTE(review): ``v`` is not checked to be a Mapping in this branch; a
+        # non-mapping override would fail on ``v.keys()`` -- confirm intended.
+        if isinstance(attr, Mapping):
+            extra_keys = [_k for _k in v.keys() if _k not in attr.keys()]
+            if extra_keys:
+                raise ValueError(
+                    f'Config dict override got unknown keys. ' +
+                    f'Extra keys: {extra_keys}. ' +
+                    f'Expected (a subset of) keys: {list(attr.keys())}.',
+                )
+            getattr(config, k).update(v)
+        # necessary case to allow for rope_scaling to be overridden in llama config
+        elif attr is None and isinstance(v, Mapping):
+            # Start from an empty dict so the override can be merged into it.
+            setattr(config, k, {})
+            getattr(config, k).update(v)
+        elif isinstance(attr, PretrainedConfig):
+            if not isinstance(v, Mapping):
+                raise ValueError(
+                    f'Expected a dictionary for config override {k}, but got {v}.',
+                )
+
+            # Set each nested attribute individually, rejecting unknown names.
+            for _k, _v in v.items():
+                if not hasattr(attr, _k):
+                    raise ValueError(
+                        f'config does not have attribute "{_k}" to override ({k}: {_k}: {_v}).',
+                    )
+                setattr(attr, _k, _v)
+        else:
+            # Plain attribute: overwrite directly.
+            setattr(config, k, v)
diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -383,6 +383,8 @@ def test_huggingface_conversion_callback_interval(
     mlflow_logger_mock.model_registry_prefix = ''
     mlflow_logger_mock._experiment_id = 'mlflow-experiment-id'
     mlflow_logger_mock._run_id = 'mlflow-run-id'
+    mlflow_logger_mock._enabled = True
+    mlflow_logger_mock.run_url = 'fake-url'
     checkpointer_callback.transform_model_pre_registration = MagicMock(
         wraps=checkpointer_callback.transform_model_pre_registration,
     )