From c13767a8ae0d09d3836ff926ed08536b04ae799c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 15 Sep 2023 22:35:31 -0700 Subject: [PATCH 01/54] wip --- llmfoundry/callbacks/hf_checkpointer.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index fe3028ab19..23402b2171 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -13,7 +13,7 @@ from composer.callbacks.utils import create_interval_scheduler from composer.core import Callback, Event, State, Time from composer.core.state import fsdp_state_dict_type_context -from composer.loggers import Logger +from composer.loggers import Logger, MLFlowLogger from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel from composer.utils import dist, format_name_with_dist_and_time, parse_uri @@ -39,6 +39,7 @@ class HuggingFaceCheckpointer(Callback): huggingface_folder_name (str): Folder to save each checkpoint under (can be a format string). Default is ``ba{batch}``. precision: The precision to save the model in. Default is ``float32``. Options are ``bfloat16``, ``float16``, or ``float32``. overwrite (bool): Whether to overwrite previous checkpoints. + log_to_mlflow (bool): Whether to log and register the checkpoint to MLFlow. Default is ``False``. """ def __init__( @@ -48,6 +49,7 @@ def __init__( huggingface_folder_name: str = 'ba{batch}', precision: str = 'fp32', overwrite: bool = False, + log_to_mlflow: bool = False, ): self.backend, self.bucket_name, self.save_dir_format_str = parse_uri( save_folder) @@ -58,6 +60,7 @@ def __init__( 'float16': torch.float16, 'bfloat16': torch.bfloat16, }[precision] + self.log_to_mlflow = log_to_mlflow self.huggingface_folder_name_fstr = os.path.join( 'huggingface', huggingface_folder_name) self.check_interval = create_interval_scheduler( @@ -77,7 +80,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: if state.get_elapsed_duration() is not None and self.check_interval( state, event) and self.last_checkpoint_batch != state.timestamp.batch: - self._save_checkpoint(state, logger) + self._save_checkpoint(state, logger, is_fit_end=event == Event.FIT_END) elif event == Event.INIT: if not isinstance(state.model, HuggingFaceModel): raise ValueError( @@ -87,7 +90,15 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: self.remote_ud.init(state, logger) state.callbacks.append(self.remote_ud) - def _save_checkpoint(self, state: State, logger: Logger): + if self.log_to_mlflow: + mlflow_loggers = [logger_destination for logger_destination in state.loggers if isinstance(logger_destination, MLFlowLogger)] + if len(mlflow_loggers) == 0: + raise ValueError( + f'`log_to_mlflow` was set to `True` but no `MLFlowLogger` was found in the `state.loggers` list. ' + + 'Please add an `MLFlowLogger` or set `log_to_mlflow` to `False`.' 
+ ) + + def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = False): del logger # unused self.last_checkpoint_batch = state.timestamp.batch @@ -164,4 +175,7 @@ def _save_checkpoint(self, state: State, logger: Logger): overwrite=self.overwrite, ) + if self.log_to_mlflow: + + dist.barrier() From 2c0ff34f0f8a820ebafaf046b8281d4832a9a8ba Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 18 Sep 2023 19:08:41 -0700 Subject: [PATCH 02/54] wip --- llmfoundry/callbacks/hf_checkpointer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 23402b2171..1f1ebcec6a 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -20,6 +20,7 @@ from transformers import PreTrainedTokenizerBase from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM +from llmfoundry.models.utils import init_empty_weights from llmfoundry.utils.huggingface_hub_utils import \ edit_files_for_hf_compatibility @@ -162,7 +163,6 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals if self.upload_to_object_store: assert self.remote_ud is not None - # TODO change to log after other pr log.info( f'Uploading HuggingFace formatted checkpoint to {self.backend}://{self.bucket_name}/{save_dir}' ) @@ -176,6 +176,10 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals ) if self.log_to_mlflow: + # Free up memory before creating another copy of the model to log to MLFlow + del state_dict + + dist.barrier() From 63dc74402ecc6e1789c423a9dcc050b94581b558 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 19 Sep 2023 03:01:25 +0000 Subject: [PATCH 03/54] wip --- llmfoundry/callbacks/hf_checkpointer.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 1f1ebcec6a..4b359a1028 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -75,6 +75,7 @@ def __init__( self.remote_ud = None self.last_checkpoint_batch: Optional[Time] = None + self.mlflow_loggers = [] def run_event(self, event: Event, state: State, logger: Logger) -> None: # The interval scheduler handles only returning True for the appropriate events @@ -92,8 +93,8 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: state.callbacks.append(self.remote_ud) if self.log_to_mlflow: - mlflow_loggers = [logger_destination for logger_destination in state.loggers if isinstance(logger_destination, MLFlowLogger)] - if len(mlflow_loggers) == 0: + self.mlflow_loggers = [logger_destination for logger_destination in state.loggers if isinstance(logger_destination, MLFlowLogger)] + if len(self.mlflow_loggers) == 0: raise ValueError( f'`log_to_mlflow` was set to `True` but no `MLFlowLogger` was found in the `state.loggers` list. ' + 'Please add an `MLFlowLogger` or set `log_to_mlflow` to `False`.' 
@@ -138,6 +139,8 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals if dist.get_global_rank() == 0: # We raise above if the model is not a HuggingFaceModel, so this assert is safe assert hasattr(state.model.model, 'save_pretrained') + print(type(state.model.model.module)) + print(type(state.model.model)) state.model.model.save_pretrained(temp_save_dir, state_dict=state_dict) @@ -179,6 +182,14 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals # Free up memory before creating another copy of the model to log to MLFlow del state_dict + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + if isinstance(state.model.model, FSDP): + model_class = state.model.model.module + else: + model_class = state.model.model + + new_instance = model_class.from_pretrained(temp_save_dir) + From 694eca4e43b029fda33657c0177cdcb51628159a Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 19 Sep 2023 07:06:49 +0000 Subject: [PATCH 04/54] small model works --- llmfoundry/callbacks/hf_checkpointer.py | 38 +++++++++++++++++-------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 4b359a1028..dade12121e 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -17,10 +17,9 @@ from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel from composer.utils import dist, format_name_with_dist_and_time, parse_uri -from transformers import PreTrainedTokenizerBase +from transformers import PreTrainedTokenizerBase, AutoTokenizer from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM -from llmfoundry.models.utils import init_empty_weights from llmfoundry.utils.huggingface_hub_utils import \ edit_files_for_hf_compatibility @@ -51,6 +50,8 @@ def __init__( precision: str = 'fp32', overwrite: bool = False, log_to_mlflow: bool = False, + mlflow_task: str = 'text-generation', + mlflow_metadata: Optional[dict] = None, ): self.backend, self.bucket_name, self.save_dir_format_str = parse_uri( save_folder) @@ -62,6 +63,12 @@ def __init__( 'bfloat16': torch.bfloat16, }[precision] self.log_to_mlflow = log_to_mlflow + self.mlflow_task = mlflow_task + + self.mlflow_metadata = mlflow_metadata + if self.mlflow_metadata is None: + self.mlflow_metadata = {'task': 'llm/v1/completions'} + self.huggingface_folder_name_fstr = os.path.join( 'huggingface', huggingface_folder_name) self.check_interval = create_interval_scheduler( @@ -93,10 +100,10 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: state.callbacks.append(self.remote_ud) if self.log_to_mlflow: - self.mlflow_loggers = [logger_destination for logger_destination in state.loggers if isinstance(logger_destination, MLFlowLogger)] + self.mlflow_loggers = [logger_destination for logger_destination in logger.destinations if isinstance(logger_destination, MLFlowLogger)] if len(self.mlflow_loggers) == 0: raise ValueError( - f'`log_to_mlflow` was set to `True` but no `MLFlowLogger` was found in the `state.loggers` list. ' + + f'`log_to_mlflow` was set to `True` but no `MLFlowLogger` was found in the `logger.destinations` list. ' + 'Please add an `MLFlowLogger` or set `log_to_mlflow` to `False`.' 
) @@ -139,8 +146,6 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals if dist.get_global_rank() == 0: # We raise above if the model is not a HuggingFaceModel, so this assert is safe assert hasattr(state.model.model, 'save_pretrained') - print(type(state.model.model.module)) - print(type(state.model.model)) state.model.model.save_pretrained(temp_save_dir, state_dict=state_dict) @@ -188,9 +193,18 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals else: model_class = state.model.model - new_instance = model_class.from_pretrained(temp_save_dir) - - - - - dist.barrier() + new_model_instance = model_class.from_pretrained(temp_save_dir) + components = {'model': new_model_instance} + if state.model.tokenizer is not None: + new_tokenizer_instance = AutoTokenizer.from_pretrained(temp_save_dir) + components['tokenizer'] = new_tokenizer_instance + + for mlflow_logger in self.mlflow_loggers: + mlflow_logger.log_model( + flavor='transformers', + transformers_model=components, + artifact_path=os.path.basename(save_dir), + task=self.mlflow_task, + registered_model_name=f'{state.run_name}_{os.path.basename(save_dir)}', + metadata=self.mlflow_metadata, + ) From 91c1c0365b7b674c85a868153216bb221fe796b5 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 19 Sep 2023 00:07:15 -0700 Subject: [PATCH 05/54] temp comment out --- llmfoundry/utils/config_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 8690271874..7a41d9cbde 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -162,5 +162,5 @@ def log_config(cfg: DictConfig) -> None: import mlflow except ImportError as e: raise e - if mlflow.active_run(): - mlflow.log_params(params=om.to_container(cfg, resolve=True)) + # if mlflow.active_run(): + # mlflow.log_params(params=om.to_container(cfg, resolve=True)) From 54105570fb985c8cc2d2411d2bca970a2aaed4e7 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 19 Sep 2023 00:33:43 -0700 Subject: [PATCH 06/54] more logs --- llmfoundry/callbacks/hf_checkpointer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index dade12121e..5a3341a4d8 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -134,6 +134,7 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals assert isinstance(temp_save_dir, str) # pyright doesn't know about enter_result + log.debug('Gathering state dict') with fsdp_state_dict_type_context(state.model.model, state_dict_type='full'): state_dict = state.model.model.state_dict() @@ -144,6 +145,7 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals state_dict[k] = v.to(dtype=self.dtype) if dist.get_global_rank() == 0: + log.debug('Saving Hugging Face checkpoint to disk') # We raise above if the model is not a HuggingFaceModel, so this assert is safe assert hasattr(state.model.model, 'save_pretrained') state.model.model.save_pretrained(temp_save_dir, @@ -184,6 +186,7 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals ) if self.log_to_mlflow: + log.debug('Reloading model to log to MLFlow') # Free up memory before creating another copy of the model to log to MLFlow del state_dict @@ -194,11 +197,13 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals 
model_class = state.model.model new_model_instance = model_class.from_pretrained(temp_save_dir) + new_model_instance.to(self.dtype) components = {'model': new_model_instance} if state.model.tokenizer is not None: new_tokenizer_instance = AutoTokenizer.from_pretrained(temp_save_dir) components['tokenizer'] = new_tokenizer_instance + log.debug('Logging Hugging Face model to MLFlow') for mlflow_logger in self.mlflow_loggers: mlflow_logger.log_model( flavor='transformers', From 3f6049847d8ef13e57a71b26b66cd85fdf1e86fb Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 19 Sep 2023 01:27:44 -0700 Subject: [PATCH 07/54] tweaks --- llmfoundry/callbacks/hf_checkpointer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 5a3341a4d8..1626d45ce9 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -106,6 +106,9 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: f'`log_to_mlflow` was set to `True` but no `MLFlowLogger` was found in the `logger.destinations` list. ' + 'Please add an `MLFlowLogger` or set `log_to_mlflow` to `False`.' ) + + import mlflow + mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set("5GB") def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = False): del logger # unused @@ -185,7 +188,7 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals overwrite=self.overwrite, ) - if self.log_to_mlflow: + if self.log_to_mlflow and is_fit_end: log.debug('Reloading model to log to MLFlow') # Free up memory before creating another copy of the model to log to MLFlow del state_dict @@ -212,4 +215,5 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals task=self.mlflow_task, registered_model_name=f'{state.run_name}_{os.path.basename(save_dir)}', metadata=self.mlflow_metadata, + await_registration_for=None, ) From 765a599a253f1a8f5931c991a2ed273e7bc456b1 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 19 Sep 2023 01:43:23 -0700 Subject: [PATCH 08/54] fix fit end --- llmfoundry/callbacks/hf_checkpointer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 1626d45ce9..9c84801ae4 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -89,7 +89,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: if state.get_elapsed_duration() is not None and self.check_interval( state, event) and self.last_checkpoint_batch != state.timestamp.batch: - self._save_checkpoint(state, logger, is_fit_end=event == Event.FIT_END) + self._save_checkpoint(state, logger) elif event == Event.INIT: if not isinstance(state.model, HuggingFaceModel): raise ValueError( @@ -110,7 +110,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: import mlflow mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set("5GB") - def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = False): + def _save_checkpoint(self, state: State, logger: Logger): del logger # unused self.last_checkpoint_batch = state.timestamp.batch @@ -188,7 +188,7 @@ def _save_checkpoint(self, state: State, logger: Logger, is_fit_end: bool = Fals overwrite=self.overwrite, ) - if self.log_to_mlflow and is_fit_end: + if self.log_to_mlflow and 
state.get_elapsed_duration() >= 1.0: log.debug('Reloading model to log to MLFlow') # Free up memory before creating another copy of the model to log to MLFlow del state_dict From 9095cdbd88fa767dd9d309f864e0560047eeb6d4 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 19 Sep 2023 01:58:47 -0700 Subject: [PATCH 09/54] speedup attempt --- llmfoundry/callbacks/hf_checkpointer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 9c84801ae4..62c759f9ad 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -190,8 +190,6 @@ def _save_checkpoint(self, state: State, logger: Logger): if self.log_to_mlflow and state.get_elapsed_duration() >= 1.0: log.debug('Reloading model to log to MLFlow') - # Free up memory before creating another copy of the model to log to MLFlow - del state_dict from torch.distributed.fsdp import FullyShardedDataParallel as FSDP if isinstance(state.model.model, FSDP): @@ -199,8 +197,9 @@ def _save_checkpoint(self, state: State, logger: Logger): else: model_class = state.model.model - new_model_instance = model_class.from_pretrained(temp_save_dir) - new_model_instance.to(self.dtype) + new_model_instance = model_class.from_config(state.model.model.config, torch_dtype=self.dtype) + new_model_instance.load_state_dict(state_dict) + del state_dict components = {'model': new_model_instance} if state.model.tokenizer is not None: new_tokenizer_instance = AutoTokenizer.from_pretrained(temp_save_dir) From cfe7312c983e033f72cd13217a12944caf431e5c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 19 Sep 2023 02:01:23 -0700 Subject: [PATCH 10/54] fix --- llmfoundry/callbacks/hf_checkpointer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 62c759f9ad..65b1912cef 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -197,7 +197,8 @@ def _save_checkpoint(self, state: State, logger: Logger): else: model_class = state.model.model - new_model_instance = model_class.from_config(state.model.model.config, torch_dtype=self.dtype) + new_model_instance = model_class(state.model.model.config) + new_model_instance.to(dtype=self.dtype) new_model_instance.load_state_dict(state_dict) del state_dict components = {'model': new_model_instance} From f3546386babed5695542dfafa96ea58f61b67209 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 19 Sep 2023 02:11:00 -0700 Subject: [PATCH 11/54] fix --- llmfoundry/callbacks/hf_checkpointer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 65b1912cef..5afa3fab14 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -197,7 +197,7 @@ def _save_checkpoint(self, state: State, logger: Logger): else: model_class = state.model.model - new_model_instance = model_class(state.model.model.config) + new_model_instance = type(model_class)(state.model.model.config) new_model_instance.to(dtype=self.dtype) new_model_instance.load_state_dict(state_dict) del state_dict From 300bda5a6ddca58c999c61564d9747bb6476d4e0 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 19 Sep 2023 10:59:14 -0700 Subject: [PATCH 12/54] fix meta --- llmfoundry/callbacks/hf_checkpointer.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 5afa3fab14..ac41b5cbeb 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -197,7 +197,7 @@ def _save_checkpoint(self, state: State, logger: Logger): else: model_class = state.model.model - new_model_instance = type(model_class)(state.model.model.config) + new_model_instance = type(model_class)(edited_config) new_model_instance.to(dtype=self.dtype) new_model_instance.load_state_dict(state_dict) del state_dict From 9f270ebbcffe956a4026494ffc3cbfbe996b4763 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 19 Sep 2023 13:03:05 -0700 Subject: [PATCH 13/54] fix config creation --- llmfoundry/callbacks/hf_checkpointer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index ac41b5cbeb..e921d9cbcd 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import contextlib +import copy import json import logging import os @@ -197,7 +198,12 @@ def _save_checkpoint(self, state: State, logger: Logger): else: model_class = state.model.model - new_model_instance = type(model_class)(edited_config) + copied_config = copy.deepcopy(state.model.model.config) + if state.model.model.config.model_type == 'mpt': + copied_config.attn_config['attn_impl'] = 'torch' + copied_config.init_device = 'cpu' + + new_model_instance = type(model_class)(copied_config) new_model_instance.to(dtype=self.dtype) new_model_instance.load_state_dict(state_dict) del state_dict From 7685fbe45b0056f3ef05b45c684b808162d21060 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 26 Sep 2023 18:03:22 -0700 Subject: [PATCH 14/54] add mlflow log model test --- tests/test_hf_conversion_script.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index c944dcfc97..cf35e76816 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -5,6 +5,7 @@ import os import pathlib import sys +from unittest.mock import MagicMock from composer import Trainer from composer.utils import dist, get_device @@ -184,8 +185,9 @@ def test_callback_inits_with_defaults(): @pytest.mark.gpu @pytest.mark.parametrize('model', ['mpt', 'neo', 'llama2']) @pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded']) +@pytest.mark.parametrize('log_to_mlflow', [True, False]) def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, - fsdp_state_dict_type: str): + fsdp_state_dict_type: str, log_to_mlflow: bool, monkeypatch): delete_transformers_cache() dist.initialize_dist(get_device('gpu')) @@ -199,10 +201,14 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, precision_str = 'bfloat16' precision = torch.bfloat16 + mock_log_model = MagicMock() + monkeypatch.setattr('mlflow.transformers.log_model', mock_log_model) + checkpointer_callback = HuggingFaceCheckpointer( save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=f'{huggingface_save_interval_batches}ba', precision=precision_str, + log_to_mlflow=log_to_mlflow, ) # get small version of each model @@ -338,6 +344,8 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, ) trainer.fit() + assert mock_log_model.call_count == int(log_to_mlflow) + # 
summon full params to check equivalence from torch.distributed.fsdp import FullyShardedDataParallel as FSDP with FSDP.summon_full_params(trainer.state.model, From c52056bc1b02012a2ee56b3221e57d61d2db2201 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 26 Sep 2023 18:09:03 -0700 Subject: [PATCH 15/54] fix --- tests/test_hf_conversion_script.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index cf35e76816..921d621a2e 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -339,6 +339,7 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, save_interval=f'{save_interval_batches}ba', max_duration=f'{max_duration_batches}ba', callbacks=[checkpointer_callback], + loggers=[MagicMock()] if log_to_mlflow else [], optimizers=optimizer, save_latest_filename=None, ) From e6bb71a74b1ec882a34c3688e9df0a310dc4b0f2 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 26 Sep 2023 18:12:53 -0700 Subject: [PATCH 16/54] fix --- tests/test_hf_conversion_script.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 921d621a2e..8d98670911 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -8,6 +8,7 @@ from unittest.mock import MagicMock from composer import Trainer +from composer.loggers import MLFlowLogger from composer.utils import dist, get_device from llmfoundry.callbacks import HuggingFaceCheckpointer @@ -201,7 +202,7 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, precision_str = 'bfloat16' precision = torch.bfloat16 - mock_log_model = MagicMock() + mock_log_model = MagicMock(spec=MLFlowLogger) monkeypatch.setattr('mlflow.transformers.log_model', mock_log_model) checkpointer_callback = HuggingFaceCheckpointer( From cfa730e7a622040a91b77127c11ae7c6c5a737e9 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 27 Sep 2023 01:35:50 +0000 Subject: [PATCH 17/54] fix test --- tests/test_hf_conversion_script.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 8d98670911..a345fe24e1 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -202,9 +202,6 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, precision_str = 'bfloat16' precision = torch.bfloat16 - mock_log_model = MagicMock(spec=MLFlowLogger) - monkeypatch.setattr('mlflow.transformers.log_model', mock_log_model) - checkpointer_callback = HuggingFaceCheckpointer( save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=f'{huggingface_save_interval_batches}ba', @@ -331,6 +328,9 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, optimizer = build_optimizer(original_model, optimizer_name, optimizer_config) + mlflow_logger_mock = MagicMock(spec=MLFlowLogger) + mlflow_logger_mock.state_dict = lambda *args, **kwargs: {} + mlflow_logger_mock.log_model = MagicMock() trainer = Trainer( model=original_model, device='gpu', @@ -340,13 +340,16 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, save_interval=f'{save_interval_batches}ba', max_duration=f'{max_duration_batches}ba', callbacks=[checkpointer_callback], - loggers=[MagicMock()] if log_to_mlflow else [], + loggers=[mlflow_logger_mock] if log_to_mlflow else [], 
optimizers=optimizer, save_latest_filename=None, ) trainer.fit() - assert mock_log_model.call_count == int(log_to_mlflow) + if dist.get_global_rank() == 0: + assert mlflow_logger_mock.log_model.call_count == (1 if log_to_mlflow else 0) + else: + assert mlflow_logger_mock.log_model.call_count == 0 # summon full params to check equivalence from torch.distributed.fsdp import FullyShardedDataParallel as FSDP From d29fb572dfaa5487bc295da2abe93404b73d4076 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 26 Sep 2023 18:38:09 -0700 Subject: [PATCH 18/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 25 +++++++++++++++++-------- tests/test_hf_conversion_script.py | 6 ++++-- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index ba1aa6378c..c880fa9fbf 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -18,7 +18,7 @@ from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel from composer.utils import dist, format_name_with_dist_and_time, parse_uri -from transformers import PreTrainedTokenizerBase, AutoTokenizer +from transformers import AutoTokenizer, PreTrainedTokenizerBase from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils.huggingface_hub_utils import \ @@ -101,15 +101,21 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: state.callbacks.append(self.remote_ud) if self.log_to_mlflow: - self.mlflow_loggers = [logger_destination for logger_destination in logger.destinations if isinstance(logger_destination, MLFlowLogger)] + self.mlflow_loggers = [ + logger_destination + for logger_destination in logger.destinations + if isinstance(logger_destination, MLFlowLogger) + ] if len(self.mlflow_loggers) == 0: raise ValueError( - f'`log_to_mlflow` was set to `True` but no `MLFlowLogger` was found in the `logger.destinations` list. ' + + f'`log_to_mlflow` was set to `True` but no `MLFlowLogger` was found in the `logger.destinations` list. ' + + 'Please add an `MLFlowLogger` or set `log_to_mlflow` to `False`.' 
) - + import mlflow - mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set("5GB") + mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set( + '5GB') def _save_checkpoint(self, state: State, logger: Logger): del logger # unused @@ -192,7 +198,8 @@ def _save_checkpoint(self, state: State, logger: Logger): if self.log_to_mlflow and state.get_elapsed_duration() >= 1.0: log.debug('Reloading model to log to MLFlow') - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + from torch.distributed.fsdp import \ + FullyShardedDataParallel as FSDP if isinstance(state.model.model, FSDP): model_class = state.model.model.module else: @@ -209,7 +216,8 @@ def _save_checkpoint(self, state: State, logger: Logger): del state_dict components = {'model': new_model_instance} if state.model.tokenizer is not None: - new_tokenizer_instance = AutoTokenizer.from_pretrained(temp_save_dir) + new_tokenizer_instance = AutoTokenizer.from_pretrained( + temp_save_dir) components['tokenizer'] = new_tokenizer_instance log.debug('Logging Hugging Face model to MLFlow') @@ -219,7 +227,8 @@ def _save_checkpoint(self, state: State, logger: Logger): transformers_model=components, artifact_path=os.path.basename(save_dir), task=self.mlflow_task, - registered_model_name=f'{state.run_name}_{os.path.basename(save_dir)}', + registered_model_name= + f'{state.run_name}_{os.path.basename(save_dir)}', metadata=self.mlflow_metadata, await_registration_for=None, ) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index a345fe24e1..6a569e88b6 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -188,7 +188,8 @@ def test_callback_inits_with_defaults(): @pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded']) @pytest.mark.parametrize('log_to_mlflow', [True, False]) def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, - fsdp_state_dict_type: str, log_to_mlflow: bool, monkeypatch): + fsdp_state_dict_type: str, + log_to_mlflow: bool, monkeypatch): delete_transformers_cache() dist.initialize_dist(get_device('gpu')) @@ -347,7 +348,8 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, trainer.fit() if dist.get_global_rank() == 0: - assert mlflow_logger_mock.log_model.call_count == (1 if log_to_mlflow else 0) + assert mlflow_logger_mock.log_model.call_count == (1 if log_to_mlflow + else 0) else: assert mlflow_logger_mock.log_model.call_count == 0 From 6180466b647ab0c8610a4e05dd56b6000f8ca365 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 26 Sep 2023 18:38:55 -0700 Subject: [PATCH 19/54] precommit --- llmfoundry/utils/config_utils.py | 4 ++-- tests/test_hf_conversion_script.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 8c9e516d36..6680154e87 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -162,5 +162,5 @@ def log_config(cfg: DictConfig) -> None: import mlflow except ImportError as e: raise e - # if mlflow.active_run(): - # mlflow.log_params(params=om.to_container(cfg, resolve=True)) + if mlflow.active_run(): + mlflow.log_params(params=om.to_container(cfg, resolve=True)) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 6a569e88b6..81838655ff 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -189,7 +189,7 @@ def 
test_callback_inits_with_defaults(): @pytest.mark.parametrize('log_to_mlflow', [True, False]) def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, fsdp_state_dict_type: str, - log_to_mlflow: bool, monkeypatch): + log_to_mlflow: bool): delete_transformers_cache() dist.initialize_dist(get_device('gpu')) From 62a0fd6ac6cad3b95e6cf0407d291aa58b19906a Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 27 Sep 2023 01:00:27 -0700 Subject: [PATCH 20/54] pyright --- llmfoundry/callbacks/hf_checkpointer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index c880fa9fbf..696bad18f9 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -195,7 +195,7 @@ def _save_checkpoint(self, state: State, logger: Logger): overwrite=self.overwrite, ) - if self.log_to_mlflow and state.get_elapsed_duration() >= 1.0: + if self.log_to_mlflow and state.get_elapsed_duration() is not None and state.get_elapsed_duration() >= 1.0: log.debug('Reloading model to log to MLFlow') from torch.distributed.fsdp import \ From 1efc4aea99bd2bb867662693d4e8da6180e6181b Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 27 Sep 2023 01:00:57 -0700 Subject: [PATCH 21/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 696bad18f9..5f54f7ab71 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -195,7 +195,8 @@ def _save_checkpoint(self, state: State, logger: Logger): overwrite=self.overwrite, ) - if self.log_to_mlflow and state.get_elapsed_duration() is not None and state.get_elapsed_duration() >= 1.0: + if self.log_to_mlflow and state.get_elapsed_duration( + ) is not None and state.get_elapsed_duration() >= 1.0: log.debug('Reloading model to log to MLFlow') from torch.distributed.fsdp import \ From bbc27d2b2f2e99fdaec96dee9952a9cfd224b43b Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 27 Sep 2023 01:12:38 -0700 Subject: [PATCH 22/54] pyright --- llmfoundry/callbacks/hf_checkpointer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 5f54f7ab71..c9f8d0aff2 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -195,8 +195,8 @@ def _save_checkpoint(self, state: State, logger: Logger): overwrite=self.overwrite, ) - if self.log_to_mlflow and state.get_elapsed_duration( - ) is not None and state.get_elapsed_duration() >= 1.0: + elapsed_duration = state.get_elapsed_duration() + if self.log_to_mlflow and elapsed_duration is not None and elapsed_duration >= 1.0: log.debug('Reloading model to log to MLFlow') from torch.distributed.fsdp import \ From b55c71d230e5ff4fed8d6f8f484175aafd5aba79 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 28 Sep 2023 02:04:11 +0000 Subject: [PATCH 23/54] merge --- llmfoundry/callbacks/hf_checkpointer.py | 24 +++++++++++++++---- scripts/train/yamls/pretrain/mpt-125m.yaml | 28 +++++++++++++++------- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index c9f8d0aff2..dd65b912ec 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py 
@@ -41,6 +41,11 @@ class HuggingFaceCheckpointer(Callback): precision: The precision to save the model in. Default is ``float32``. Options are ``bfloat16``, ``float16``, or ``float32``. overwrite (bool): Whether to overwrite previous checkpoints. log_to_mlflow (bool): Whether to log and register the checkpoint to MLFlow. Default is ``False``. + mlflow_task (str): The MLFlow task to log the checkpoint under. Only used if ``log_to_mlflow`` is ``True``. Default is ``text-generation``. + mlflow_metadata (Optional[dict]): The MLFlow metadata to log the checkpoint with. Only used if ``log_to_mlflow`` is ``True``. Default is ``None``. + uc_prefix: (Optional[str]): Prefix to use for the MLFlow registered model. If specified, the model will be logged to UC rather than + the workspace model registry. If specified, the prefix must be of the form ``{catalog}.{schema}`` + The model will be registered at ``{catalog}.{schema}.{model name}``. Only used if ``log_to_mlflow`` is ``True``. Default is ``None``. """ def __init__( @@ -53,6 +58,7 @@ def __init__( log_to_mlflow: bool = False, mlflow_task: str = 'text-generation', mlflow_metadata: Optional[dict] = None, + uc_prefix: Optional[str] = None, ): self.backend, self.bucket_name, self.save_dir_format_str = parse_uri( save_folder) @@ -85,6 +91,15 @@ def __init__( self.last_checkpoint_batch: Optional[Time] = None self.mlflow_loggers = [] + self.uc_prefix = uc_prefix + if self.log_to_mlflow and uc_prefix is not None: + split_prefix = uc_prefix.split('.') + if len(split_prefix) != 2: + raise ValueError( + f'`uc_prefix` must be of the form `{{catalog}}.{{schema}}`. Got {uc_prefix} instead.' + ) + + def run_event(self, event: Event, state: State, logger: Logger) -> None: # The interval scheduler handles only returning True for the appropriate events if state.get_elapsed_duration() is not None and self.check_interval( @@ -114,8 +129,8 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: ) import mlflow - mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set( - '5GB') + mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set("5GB") + mlflow.set_registry_uri('databricks-uc') def _save_checkpoint(self, state: State, logger: Logger): del logger # unused @@ -222,14 +237,15 @@ def _save_checkpoint(self, state: State, logger: Logger): components['tokenizer'] = new_tokenizer_instance log.debug('Logging Hugging Face model to MLFlow') + registered_model_name = f'{state.run_name}_{os.path.basename(save_dir)}' + registered_model_name_full = f'{self.uc_prefix}.{registered_model_name}' if self.uc_prefix is not None else registered_model_name for mlflow_logger in self.mlflow_loggers: mlflow_logger.log_model( flavor='transformers', transformers_model=components, artifact_path=os.path.basename(save_dir), task=self.mlflow_task, - registered_model_name= - f'{state.run_name}_{os.path.basename(save_dir)}', + registered_model_name=registered_model_name_full, metadata=self.mlflow_metadata, await_registration_for=None, ) diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index 1d4c1d964c..ac95719662 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -4,7 +4,7 @@ max_seq_len: 2048 global_seed: 17 # Run Name -run_name: # If left blank, will be read from env var $RUN_NAME +run_name: test-mlflow-register-3 # Model model: @@ -31,7 +31,7 @@ train_loader: dataset: local: ${data_local} remote: ${data_remote} - split: train + split: 
train_small shuffle: true max_seq_len: ${max_seq_len} shuffle_seed: ${global_seed} @@ -43,7 +43,7 @@ eval_loader: dataset: local: ${data_local} remote: ${data_remote} - split: val + split: val_small shuffle: false max_seq_len: ${max_seq_len} shuffle_seed: ${global_seed} @@ -70,16 +70,16 @@ algorithms: clipping_type: norm clipping_threshold: 1.0 -max_duration: 4800ba # ~ 2.5B tokens +max_duration: 10ba # ~ 2.5B tokens eval_interval: 500ba eval_first: false -eval_subset_num_batches: -1 -global_train_batch_size: 256 +eval_subset_num_batches: 2 +global_train_batch_size: 2 # System seed: ${global_seed} -device_eval_batch_size: 16 -device_train_microbatch_size: 16 +device_eval_batch_size: 1 +device_train_microbatch_size: 1 # device_train_microbatch_size: auto precision: amp_bf16 @@ -104,6 +104,16 @@ callbacks: lr_monitor: {} memory_monitor: {} runtime_estimator: {} + hf_checkpointer: + save_interval: 10ba + precision: bfloat16 + save_folder: ./{run_name}/checkpoints + log_to_mlflow: true + uc_prefix: main.danielking + +loggers: + mlflow: + experiment_name: /Users/daniel.king@databricks.com/mlflow-logging-test # loggers: # wandb: {} @@ -111,7 +121,7 @@ callbacks: # Checkpoint to local filesystem or remote object store # save_interval: 500ba # save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK -# save_folder: ./{run_name}/checkpoints +save_folder: ./{run_name}/checkpoints # save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints # Load from local filesystem or remote object store From 6715d887754cb8cf672f7e9537497b8785dd4cca Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 27 Sep 2023 19:10:45 -0700 Subject: [PATCH 24/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index dd65b912ec..66637f1d37 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -99,7 +99,6 @@ def __init__( f'`uc_prefix` must be of the form `{{catalog}}.{{schema}}`. Got {uc_prefix} instead.' 
) - def run_event(self, event: Event, state: State, logger: Logger) -> None: # The interval scheduler handles only returning True for the appropriate events if state.get_elapsed_duration() is not None and self.check_interval( @@ -129,7 +128,8 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: ) import mlflow - mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set("5GB") + mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set( + '5GB') mlflow.set_registry_uri('databricks-uc') def _save_checkpoint(self, state: State, logger: Logger): From 932ba7fb270cde0db0306734a75055fe036efbc5 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 27 Sep 2023 19:30:49 -0700 Subject: [PATCH 25/54] add logging --- llmfoundry/callbacks/hf_checkpointer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 66637f1d37..ec5aa36423 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -239,6 +239,7 @@ def _save_checkpoint(self, state: State, logger: Logger): log.debug('Logging Hugging Face model to MLFlow') registered_model_name = f'{state.run_name}_{os.path.basename(save_dir)}' registered_model_name_full = f'{self.uc_prefix}.{registered_model_name}' if self.uc_prefix is not None else registered_model_name + log.debug(f'Registering model to UC at {registered_model_name_full}') for mlflow_logger in self.mlflow_loggers: mlflow_logger.log_model( flavor='transformers', From ab6d082a43a387183a552fb4b2162822a2fd7839 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 28 Sep 2023 15:49:57 -0700 Subject: [PATCH 26/54] no uc --- llmfoundry/callbacks/hf_checkpointer.py | 50 ++++++++++++------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index ec5aa36423..0e9e6005b2 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -40,12 +40,10 @@ class HuggingFaceCheckpointer(Callback): huggingface_folder_name (str): Folder to save each checkpoint under (can be a format string). Default is ``ba{batch}``. precision: The precision to save the model in. Default is ``float32``. Options are ``bfloat16``, ``float16``, or ``float32``. overwrite (bool): Whether to overwrite previous checkpoints. - log_to_mlflow (bool): Whether to log and register the checkpoint to MLFlow. Default is ``False``. - mlflow_task (str): The MLFlow task to log the checkpoint under. Only used if ``log_to_mlflow`` is ``True``. Default is ``text-generation``. - mlflow_metadata (Optional[dict]): The MLFlow metadata to log the checkpoint with. Only used if ``log_to_mlflow`` is ``True``. Default is ``None``. - uc_prefix: (Optional[str]): Prefix to use for the MLFlow registered model. If specified, the model will be logged to UC rather than - the workspace model registry. If specified, the prefix must be of the form ``{catalog}.{schema}`` - The model will be registered at ``{catalog}.{schema}.{model name}``. Only used if ``log_to_mlflow`` is ``True``. Default is ``None``. + log_to_mlflow (bool): Whether to log and register the checkpoint to MLflow. Default is ``False``. + mlflow_logging_config (Optional[dict]): A dictionary of config arguments that will get passed along to the MLflow ``log_model`` call. + Expected to contain ``metadata`` and ``task`` keys. 
If either is unspecified, the defaults are ``'text-generation'`` and + """ def __init__( @@ -56,9 +54,7 @@ def __init__( precision: str = 'float32', overwrite: bool = False, log_to_mlflow: bool = False, - mlflow_task: str = 'text-generation', - mlflow_metadata: Optional[dict] = None, - uc_prefix: Optional[str] = None, + mlflow_logging_config: Optional[dict] = None, ): self.backend, self.bucket_name, self.save_dir_format_str = parse_uri( save_folder) @@ -69,12 +65,16 @@ def __init__( 'float16': torch.float16, 'bfloat16': torch.bfloat16, }[precision] - self.log_to_mlflow = log_to_mlflow - self.mlflow_task = mlflow_task - self.mlflow_metadata = mlflow_metadata - if self.mlflow_metadata is None: - self.mlflow_metadata = {'task': 'llm/v1/completions'} + # mlflow config setup + self.log_to_mlflow = log_to_mlflow + if mlflow_logging_config is None: + mlflow_logging_config = {} + if 'metadata' not in mlflow_logging_config: + mlflow_logging_config['metadata'] = {'task': 'llm/v1/completions'} + if 'task' not in mlflow_logging_config: + mlflow_logging_config['task'] = 'text-generation' + self.mlflow_logging_config = mlflow_logging_config self.huggingface_folder_name_fstr = os.path.join( 'huggingface', huggingface_folder_name) @@ -91,13 +91,13 @@ def __init__( self.last_checkpoint_batch: Optional[Time] = None self.mlflow_loggers = [] - self.uc_prefix = uc_prefix - if self.log_to_mlflow and uc_prefix is not None: - split_prefix = uc_prefix.split('.') - if len(split_prefix) != 2: - raise ValueError( - f'`uc_prefix` must be of the form `{{catalog}}.{{schema}}`. Got {uc_prefix} instead.' - ) + # self.uc_prefix = uc_prefix + # if self.log_to_mlflow and uc_prefix is not None: + # split_prefix = uc_prefix.split('.') + # if len(split_prefix) != 2: + # raise ValueError( + # f'`uc_prefix` must be of the form `{{catalog}}.{{schema}}`. Got {uc_prefix} instead.' 
+ # ) def run_event(self, event: Event, state: State, logger: Logger) -> None: # The interval scheduler handles only returning True for the appropriate events @@ -130,7 +130,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: import mlflow mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set( '5GB') - mlflow.set_registry_uri('databricks-uc') + # mlflow.set_registry_uri('databricks-uc') def _save_checkpoint(self, state: State, logger: Logger): del logger # unused @@ -238,15 +238,15 @@ def _save_checkpoint(self, state: State, logger: Logger): log.debug('Logging Hugging Face model to MLFlow') registered_model_name = f'{state.run_name}_{os.path.basename(save_dir)}' - registered_model_name_full = f'{self.uc_prefix}.{registered_model_name}' if self.uc_prefix is not None else registered_model_name - log.debug(f'Registering model to UC at {registered_model_name_full}') + # registered_model_name_full = f'{self.uc_prefix}.{registered_model_name}' if self.uc_prefix is not None else registered_model_name + log.debug(f'Registering model to UC at {registered_model_name}') for mlflow_logger in self.mlflow_loggers: mlflow_logger.log_model( flavor='transformers', transformers_model=components, artifact_path=os.path.basename(save_dir), task=self.mlflow_task, - registered_model_name=registered_model_name_full, + registered_model_name=registered_model_name, metadata=self.mlflow_metadata, await_registration_for=None, ) From 24f370295b04d310cfb0a6f3393d32dec94825fe Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 28 Sep 2023 19:07:29 -0700 Subject: [PATCH 27/54] update to new save and register --- llmfoundry/callbacks/hf_checkpointer.py | 30 +++++++++++-------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 0e9e6005b2..0c5f953aa7 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -91,14 +91,6 @@ def __init__( self.last_checkpoint_batch: Optional[Time] = None self.mlflow_loggers = [] - # self.uc_prefix = uc_prefix - # if self.log_to_mlflow and uc_prefix is not None: - # split_prefix = uc_prefix.split('.') - # if len(split_prefix) != 2: - # raise ValueError( - # f'`uc_prefix` must be of the form `{{catalog}}.{{schema}}`. Got {uc_prefix} instead.' 
- # ) - def run_event(self, event: Event, state: State, logger: Logger) -> None: # The interval scheduler handles only returning True for the appropriate events if state.get_elapsed_duration() is not None and self.check_interval( @@ -238,15 +230,19 @@ def _save_checkpoint(self, state: State, logger: Logger): log.debug('Logging Hugging Face model to MLFlow') registered_model_name = f'{state.run_name}_{os.path.basename(save_dir)}' - # registered_model_name_full = f'{self.uc_prefix}.{registered_model_name}' if self.uc_prefix is not None else registered_model_name - log.debug(f'Registering model to UC at {registered_model_name}') - for mlflow_logger in self.mlflow_loggers: - mlflow_logger.log_model( + for i, mlflow_logger in enumerate(self.mlflow_loggers): + log.debug(f'Registering model to UC at {mlflow_logger.model_registry_prefix}.{registered_model_name}') + local_save_path = str( + Path(temp_save_dir) / f'mlflow_save_{i}') + mlflow_logger.save_model( flavor='transformers', transformers_model=components, - artifact_path=os.path.basename(save_dir), - task=self.mlflow_task, - registered_model_name=registered_model_name, - metadata=self.mlflow_metadata, - await_registration_for=None, + path=local_save_path, + **self.mlflow_logging_config, + ) + mlflow_logger.register_model( + model_uri=local_save_path, + name=registered_model_name, + await_registration_for=3600, ) + From 0d1add2c620d89030bf3e1db1dd48485d9e255fb Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 03:04:45 +0000 Subject: [PATCH 28/54] monkeypatch --- llmfoundry/callbacks/hf_checkpointer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 0c5f953aa7..16b1d3e8c9 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -234,6 +234,8 @@ def _save_checkpoint(self, state: State, logger: Logger): log.debug(f'Registering model to UC at {mlflow_logger.model_registry_prefix}.{registered_model_name}') local_save_path = str( Path(temp_save_dir) / f'mlflow_save_{i}') + import mlflow + mlflow.store._unity_catalog.registry.rest_store.get_feature_dependencies = lambda *args, **kwargs: "" mlflow_logger.save_model( flavor='transformers', transformers_model=components, From 67ee5bd87fdeacffa52e8ae456da002882b9ee0d Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 08:05:46 +0000 Subject: [PATCH 29/54] fix tests --- tests/test_hf_conversion_script.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 81838655ff..23fec512c8 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -331,7 +331,9 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, mlflow_logger_mock = MagicMock(spec=MLFlowLogger) mlflow_logger_mock.state_dict = lambda *args, **kwargs: {} - mlflow_logger_mock.log_model = MagicMock() + mlflow_logger_mock.save_model = MagicMock() + mlflow_logger_mock.register_model = MagicMock() + mlflow_logger_mock.model_registry_prefix = '' trainer = Trainer( model=original_model, device='gpu', @@ -348,10 +350,13 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, trainer.fit() if dist.get_global_rank() == 0: - assert mlflow_logger_mock.log_model.call_count == (1 if log_to_mlflow + assert mlflow_logger_mock.save_model.call_count == (1 if log_to_mlflow + else 0) + assert 
mlflow_logger_mock.register_model.call_count == (1 if log_to_mlflow else 0) else: assert mlflow_logger_mock.log_model.call_count == 0 + assert mlflow_logger_mock.register_model.call_count == 0 # summon full params to check equivalence from torch.distributed.fsdp import FullyShardedDataParallel as FSDP From 28436a98847f8c93b402bb242e09de6d29807c8b Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 01:06:30 -0700 Subject: [PATCH 30/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 8 ++++---- tests/test_hf_conversion_script.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 16b1d3e8c9..3993324bbb 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -43,7 +43,6 @@ class HuggingFaceCheckpointer(Callback): log_to_mlflow (bool): Whether to log and register the checkpoint to MLflow. Default is ``False``. mlflow_logging_config (Optional[dict]): A dictionary of config arguments that will get passed along to the MLflow ``log_model`` call. Expected to contain ``metadata`` and ``task`` keys. If either is unspecified, the defaults are ``'text-generation'`` and - """ def __init__( @@ -231,11 +230,13 @@ def _save_checkpoint(self, state: State, logger: Logger): log.debug('Logging Hugging Face model to MLFlow') registered_model_name = f'{state.run_name}_{os.path.basename(save_dir)}' for i, mlflow_logger in enumerate(self.mlflow_loggers): - log.debug(f'Registering model to UC at {mlflow_logger.model_registry_prefix}.{registered_model_name}') + log.debug( + f'Registering model to UC at {mlflow_logger.model_registry_prefix}.{registered_model_name}' + ) local_save_path = str( Path(temp_save_dir) / f'mlflow_save_{i}') import mlflow - mlflow.store._unity_catalog.registry.rest_store.get_feature_dependencies = lambda *args, **kwargs: "" + mlflow.store._unity_catalog.registry.rest_store.get_feature_dependencies = lambda *args, **kwargs: '' mlflow_logger.save_model( flavor='transformers', transformers_model=components, @@ -247,4 +248,3 @@ def _save_checkpoint(self, state: State, logger: Logger): name=registered_model_name, await_registration_for=3600, ) - diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 23fec512c8..53cf447050 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -351,9 +351,9 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, if dist.get_global_rank() == 0: assert mlflow_logger_mock.save_model.call_count == (1 if log_to_mlflow - else 0) - assert mlflow_logger_mock.register_model.call_count == (1 if log_to_mlflow - else 0) + else 0) + assert mlflow_logger_mock.register_model.call_count == ( + 1 if log_to_mlflow else 0) else: assert mlflow_logger_mock.log_model.call_count == 0 assert mlflow_logger_mock.register_model.call_count == 0 From 7cffafebb35c9baf2094bcaf0ed021e6ceb3ac0c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 22:19:44 +0000 Subject: [PATCH 31/54] skip extra model load --- llmfoundry/callbacks/hf_checkpointer.py | 66 ++++++++++--------------- 1 file changed, 27 insertions(+), 39 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 3993324bbb..ff2924203d 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -162,31 +162,37 @@ def _save_checkpoint(self, state: State, logger: 
Logger): if dist.get_global_rank() == 0: log.debug('Saving Hugging Face checkpoint to disk') - # We raise above if the model is not a HuggingFaceModel, so this assert is safe - assert hasattr(state.model.model, 'save_pretrained') - state.model.model.save_pretrained(temp_save_dir, - state_dict=state_dict) + from torch.distributed.fsdp import \ + FullyShardedDataParallel as FSDP + if isinstance(state.model.model, FSDP): + model_class = state.model.model.module + else: + model_class = state.model.model + + copied_config = copy.deepcopy(state.model.model.config) + if state.model.model.config.model_type == 'mpt': + copied_config.attn_config['attn_impl'] = 'torch' + copied_config.init_device = 'cpu' + + # TODO: after torch 2.1, we can load a state dict into a meta model + # and skip the extra model init + log.debug(f'Creating new model instance') + new_model_instance = type(model_class)(copied_config) + new_model_instance.to(dtype=self.dtype) + new_model_instance.load_state_dict(state_dict) + del state_dict + + log.debug("Saving Hugging Face checkpoint to disk") + new_model_instance.save_pretrained('temp_save_dir') if state.model.tokenizer is not None: - assert isinstance(state.model.tokenizer, - PreTrainedTokenizerBase) - state.model.tokenizer.save_pretrained(temp_save_dir) + state.model.tokenizer.save_pretrained('temp_save_dir') # Only need to edit files for MPT because it has custom code if state.model.model.config.model_type == 'mpt': + log.debug('Editing MPT files for HuggingFace compatibility') edit_files_for_hf_compatibility(temp_save_dir) - with open(os.path.join(temp_save_dir, 'config.json'), 'r') as f: - edited_config = json.load(f) - - if state.model.model.config.model_type == 'mpt': - edited_config['attn_config']['attn_impl'] = 'torch' - edited_config['init_device'] = 'cpu' - - edited_config['torch_dtype'] = self.precision - with open(os.path.join(temp_save_dir, 'config.json'), 'w') as f: - json.dump(edited_config, f, indent=4) - if self.upload_to_object_store: assert self.remote_ud is not None log.info( @@ -203,29 +209,9 @@ def _save_checkpoint(self, state: State, logger: Logger): elapsed_duration = state.get_elapsed_duration() if self.log_to_mlflow and elapsed_duration is not None and elapsed_duration >= 1.0: - log.debug('Reloading model to log to MLFlow') - - from torch.distributed.fsdp import \ - FullyShardedDataParallel as FSDP - if isinstance(state.model.model, FSDP): - model_class = state.model.model.module - else: - model_class = state.model.model - - copied_config = copy.deepcopy(state.model.model.config) - if state.model.model.config.model_type == 'mpt': - copied_config.attn_config['attn_impl'] = 'torch' - copied_config.init_device = 'cpu' - - new_model_instance = type(model_class)(copied_config) - new_model_instance.to(dtype=self.dtype) - new_model_instance.load_state_dict(state_dict) - del state_dict components = {'model': new_model_instance} if state.model.tokenizer is not None: - new_tokenizer_instance = AutoTokenizer.from_pretrained( - temp_save_dir) - components['tokenizer'] = new_tokenizer_instance + components['tokenizer'] = state.model.tokenizer log.debug('Logging Hugging Face model to MLFlow') registered_model_name = f'{state.run_name}_{os.path.basename(save_dir)}' @@ -235,6 +221,8 @@ def _save_checkpoint(self, state: State, logger: Logger): ) local_save_path = str( Path(temp_save_dir) / f'mlflow_save_{i}') + + # TODO: Remove after mlflow fixes the bug that makes this necessary import mlflow mlflow.store._unity_catalog.registry.rest_store.get_feature_dependencies 
= lambda *args, **kwargs: '' mlflow_logger.save_model( From aa324c63caa1a96cb5ccb1491d9c50faf12ff993 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 22:20:32 +0000 Subject: [PATCH 32/54] undo yaml changes --- scripts/train/yamls/pretrain/mpt-125m.yaml | 28 +++++++--------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index ac95719662..1d4c1d964c 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -4,7 +4,7 @@ max_seq_len: 2048 global_seed: 17 # Run Name -run_name: test-mlflow-register-3 +run_name: # If left blank, will be read from env var $RUN_NAME # Model model: @@ -31,7 +31,7 @@ train_loader: dataset: local: ${data_local} remote: ${data_remote} - split: train_small + split: train shuffle: true max_seq_len: ${max_seq_len} shuffle_seed: ${global_seed} @@ -43,7 +43,7 @@ eval_loader: dataset: local: ${data_local} remote: ${data_remote} - split: val_small + split: val shuffle: false max_seq_len: ${max_seq_len} shuffle_seed: ${global_seed} @@ -70,16 +70,16 @@ algorithms: clipping_type: norm clipping_threshold: 1.0 -max_duration: 10ba # ~ 2.5B tokens +max_duration: 4800ba # ~ 2.5B tokens eval_interval: 500ba eval_first: false -eval_subset_num_batches: 2 -global_train_batch_size: 2 +eval_subset_num_batches: -1 +global_train_batch_size: 256 # System seed: ${global_seed} -device_eval_batch_size: 1 -device_train_microbatch_size: 1 +device_eval_batch_size: 16 +device_train_microbatch_size: 16 # device_train_microbatch_size: auto precision: amp_bf16 @@ -104,16 +104,6 @@ callbacks: lr_monitor: {} memory_monitor: {} runtime_estimator: {} - hf_checkpointer: - save_interval: 10ba - precision: bfloat16 - save_folder: ./{run_name}/checkpoints - log_to_mlflow: true - uc_prefix: main.danielking - -loggers: - mlflow: - experiment_name: /Users/daniel.king@databricks.com/mlflow-logging-test # loggers: # wandb: {} @@ -121,7 +111,7 @@ loggers: # Checkpoint to local filesystem or remote object store # save_interval: 500ba # save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK -save_folder: ./{run_name}/checkpoints +# save_folder: ./{run_name}/checkpoints # save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints # Load from local filesystem or remote object store From bcfb534d39060aaf5592ea9f26d6bc41bf93cb3e Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 15:21:43 -0700 Subject: [PATCH 33/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index ff2924203d..8e84374828 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -3,7 +3,6 @@ import contextlib import copy -import json import logging import os import tempfile @@ -18,7 +17,7 @@ from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel from composer.utils import dist, format_name_with_dist_and_time, parse_uri -from transformers import AutoTokenizer, PreTrainedTokenizerBase +from transformers import PreTrainedTokenizerBase from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils.huggingface_hub_utils import \ @@ -183,9 +182,10 @@ def _save_checkpoint(self, state: State, logger: Logger): 
new_model_instance.load_state_dict(state_dict) del state_dict - log.debug("Saving Hugging Face checkpoint to disk") + log.debug('Saving Hugging Face checkpoint to disk') new_model_instance.save_pretrained('temp_save_dir') if state.model.tokenizer is not None: + assert isinstance(state.model.tokenizer, PreTrainedTokenizerBase) state.model.tokenizer.save_pretrained('temp_save_dir') # Only need to edit files for MPT because it has custom code From 7a6ae1d18d41d07ea43e8530a529df3e5da1170e Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 22:57:09 +0000 Subject: [PATCH 34/54] fixes --- llmfoundry/callbacks/hf_checkpointer.py | 4 ++-- tests/test_hf_conversion_script.py | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 8e84374828..e1aecfb3ca 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -183,10 +183,10 @@ def _save_checkpoint(self, state: State, logger: Logger): del state_dict log.debug('Saving Hugging Face checkpoint to disk') - new_model_instance.save_pretrained('temp_save_dir') + new_model_instance.save_pretrained(temp_save_dir) if state.model.tokenizer is not None: assert isinstance(state.model.tokenizer, PreTrainedTokenizerBase) - state.model.tokenizer.save_pretrained('temp_save_dir') + state.model.tokenizer.save_pretrained(temp_save_dir) # Only need to edit files for MPT because it has custom code if state.model.model.config.model_type == 'mpt': diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 53cf447050..30412b8844 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -12,7 +12,7 @@ from composer.utils import dist, get_device from llmfoundry.callbacks import HuggingFaceCheckpointer -from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM +from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM, MPTConfig, MPTForCausalLM # Add repo root to path so we can import scripts and test it repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) @@ -150,6 +150,20 @@ def check_hf_model_equivalence(model1: PreTrainedModel, # so we remove it expected_model_config_dict.pop('_name_or_path') new_model_config_dict.pop('_name_or_path') + + # Special case a couple of differences that correctly occur when saving MPT to huggingface format + # checkpoint + architectures_1 = expected_model_config_dict.pop('architectures', None) + architectures_2 = new_model_config_dict.pop('architectures', None) + if architectures_1 != architectures_2: + assert architectures_1 is None and architectures_2 == ['MPTForCausalLM'] + + auto_map_1 = expected_model_config_dict.pop('auto_map', None) + auto_map_2 = new_model_config_dict.pop('auto_map', None) + if auto_map_1 != auto_map_2: + assert auto_map_1 == {'AutoConfig': 'configuration_mpt.MPTConfig'} + assert auto_map_2 == {'AutoConfig': 'configuration_mpt.MPTConfig', 'AutoModelForCausalLM': 'modeling_mpt.MPTForCausalLM'} + assert expected_model_config_dict == new_model_config_dict assert all( torch.equal(p1.cpu(), p2.cpu()) From 7190848f666fcb670c5d261dd0659e9ce5794e18 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 15:58:05 -0700 Subject: [PATCH 35/54] precommit; --- llmfoundry/callbacks/hf_checkpointer.py | 3 ++- tests/test_hf_conversion_script.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git 
a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index e1aecfb3ca..f3962ac966 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -185,7 +185,8 @@ def _save_checkpoint(self, state: State, logger: Logger): log.debug('Saving Hugging Face checkpoint to disk') new_model_instance.save_pretrained(temp_save_dir) if state.model.tokenizer is not None: - assert isinstance(state.model.tokenizer, PreTrainedTokenizerBase) + assert isinstance(state.model.tokenizer, + PreTrainedTokenizerBase) state.model.tokenizer.save_pretrained(temp_save_dir) # Only need to edit files for MPT because it has custom code diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 30412b8844..201c9bab2e 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -12,7 +12,8 @@ from composer.utils import dist, get_device from llmfoundry.callbacks import HuggingFaceCheckpointer -from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM, MPTConfig, MPTForCausalLM +from llmfoundry.models.mpt.modeling_mpt import (ComposerMPTCausalLM, MPTConfig, + MPTForCausalLM) # Add repo root to path so we can import scripts and test it repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) @@ -162,7 +163,10 @@ def check_hf_model_equivalence(model1: PreTrainedModel, auto_map_2 = new_model_config_dict.pop('auto_map', None) if auto_map_1 != auto_map_2: assert auto_map_1 == {'AutoConfig': 'configuration_mpt.MPTConfig'} - assert auto_map_2 == {'AutoConfig': 'configuration_mpt.MPTConfig', 'AutoModelForCausalLM': 'modeling_mpt.MPTForCausalLM'} + assert auto_map_2 == { + 'AutoConfig': 'configuration_mpt.MPTConfig', + 'AutoModelForCausalLM': 'modeling_mpt.MPTForCausalLM' + } assert expected_model_config_dict == new_model_config_dict assert all( From 5233e246d0d432d0eed37c66585fa1c9d5065ddf Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 16:44:48 -0700 Subject: [PATCH 36/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index f3962ac966..7c2801434b 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -17,9 +17,8 @@ from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel from composer.utils import dist, format_name_with_dist_and_time, parse_uri -from transformers import PreTrainedTokenizerBase +from transformers import PreTrainedTokenizerBase, PreTrainedModel -from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils.huggingface_hub_utils import \ edit_files_for_hf_compatibility @@ -183,6 +182,7 @@ def _save_checkpoint(self, state: State, logger: Logger): del state_dict log.debug('Saving Hugging Face checkpoint to disk') + assert isinstance(new_model_instance, PreTrainedModel) new_model_instance.save_pretrained(temp_save_dir) if state.model.tokenizer is not None: assert isinstance(state.model.tokenizer, From 33f21a99367aa036224e62c1494a50d3f984bc23 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 16:48:16 -0700 Subject: [PATCH 37/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 1 + tests/test_hf_conversion_script.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py 
b/llmfoundry/callbacks/hf_checkpointer.py index 7c2801434b..8d96128c93 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -19,6 +19,7 @@ from composer.utils import dist, format_name_with_dist_and_time, parse_uri from transformers import PreTrainedTokenizerBase, PreTrainedModel +from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils.huggingface_hub_utils import \ edit_files_for_hf_compatibility diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 201c9bab2e..62922b8a36 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -12,8 +12,7 @@ from composer.utils import dist, get_device from llmfoundry.callbacks import HuggingFaceCheckpointer -from llmfoundry.models.mpt.modeling_mpt import (ComposerMPTCausalLM, MPTConfig, - MPTForCausalLM) +from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM # Add repo root to path so we can import scripts and test it repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) From ed4eaf49b6170e2987a017d80093d319a55dd04c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 16:51:26 -0700 Subject: [PATCH 38/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 8d96128c93..c3da99bbcf 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -17,7 +17,7 @@ from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel from composer.utils import dist, format_name_with_dist_and_time, parse_uri -from transformers import PreTrainedTokenizerBase, PreTrainedModel +from transformers import PreTrainedModel, PreTrainedTokenizerBase from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils.huggingface_hub_utils import \ @@ -177,13 +177,13 @@ def _save_checkpoint(self, state: State, logger: Logger): # TODO: after torch 2.1, we can load a state dict into a meta model # and skip the extra model init log.debug(f'Creating new model instance') - new_model_instance = type(model_class)(copied_config) + new_model_instance: PreTrainedModel = type(model_class)( + copied_config) new_model_instance.to(dtype=self.dtype) new_model_instance.load_state_dict(state_dict) del state_dict log.debug('Saving Hugging Face checkpoint to disk') - assert isinstance(new_model_instance, PreTrainedModel) new_model_instance.save_pretrained(temp_save_dir) if state.model.tokenizer is not None: assert isinstance(state.model.tokenizer, From d2f88b7725f874b2edfdd6294729858f188c2f28 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 16:56:18 -0700 Subject: [PATCH 39/54] cleanup --- llmfoundry/callbacks/hf_checkpointer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index c3da99bbcf..f9d8470292 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -39,9 +39,10 @@ class HuggingFaceCheckpointer(Callback): huggingface_folder_name (str): Folder to save each checkpoint under (can be a format string). Default is ``ba{batch}``. precision: The precision to save the model in. Default is ``float32``. Options are ``bfloat16``, ``float16``, or ``float32``. 
overwrite (bool): Whether to overwrite previous checkpoints. - log_to_mlflow (bool): Whether to log and register the checkpoint to MLflow. Default is ``False``. - mlflow_logging_config (Optional[dict]): A dictionary of config arguments that will get passed along to the MLflow ``log_model`` call. + log_to_mlflow (bool): Whether to register the model to MLflow. This will only register one model at the end of training. Default is ``False``. + mlflow_logging_config (Optional[dict]): A dictionary of config arguments that will get passed along to the MLflow ``save_model`` call. Expected to contain ``metadata`` and ``task`` keys. If either is unspecified, the defaults are ``'text-generation'`` and + ``{'task': 'llm/v1/completions'}`` respectively. """ def __init__( @@ -120,7 +121,6 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: import mlflow mlflow.environment_variables.MLFLOW_HUGGINGFACE_MODEL_MAX_SHARD_SIZE.set( '5GB') - # mlflow.set_registry_uri('databricks-uc') def _save_checkpoint(self, state: State, logger: Logger): del logger # unused From d5e0683cf551b9b26665afbf8afbfe183bcececf Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 30 Sep 2023 00:39:18 +0000 Subject: [PATCH 40/54] support ddp --- llmfoundry/callbacks/hf_checkpointer.py | 53 ++++++++++++++----------- tests/test_hf_conversion_script.py | 10 ++--- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index f9d8470292..f375fb432b 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -17,7 +17,7 @@ from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel from composer.utils import dist, format_name_with_dist_and_time, parse_uri -from transformers import PreTrainedModel, PreTrainedTokenizerBase +from transformers import PreTrainedTokenizerBase, PreTrainedModel from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils.huggingface_hub_utils import \ @@ -134,8 +134,6 @@ def _save_checkpoint(self, state: State, logger: Logger): MPTConfig.register_for_auto_class() MPTForCausalLM.register_for_auto_class('AutoModelForCausalLM') - assert isinstance(state.model, HuggingFaceModel) - save_dir = format_name_with_dist_and_time( str( Path(self.save_dir_format_str) / @@ -150,9 +148,25 @@ def _save_checkpoint(self, state: State, logger: Logger): str) # pyright doesn't know about enter_result log.debug('Gathering state dict') - with fsdp_state_dict_type_context(state.model.model, - state_dict_type='full'): - state_dict = state.model.model.state_dict() + from torch.distributed.fsdp import \ + FullyShardedDataParallel as FSDP + + if state.is_model_ddp: + original_model = state.model.module.model + original_tokenizer = state.model.module.tokenizer + elif isinstance(state.model.model, FSDP): + original_model = state.model.model.module + original_tokenizer = state.model.tokenizer + else: + original_model = state.model.model + original_tokenizer = state.model.tokenizers + + assert isinstance(original_model, PreTrainedModel) + assert isinstance(original_tokenizer, PreTrainedTokenizerBase) + + state_dict_context = fsdp_state_dict_type_context(original_model, state_dict_type='full') if ((not state.is_model_ddp) and isinstance(state.model.model, FSDP)) else contextlib.nullcontext() + with state_dict_context: + state_dict = original_model.state_dict() # convert the state dict to the requested 
precision for k, v in state_dict.items(): @@ -162,36 +176,27 @@ def _save_checkpoint(self, state: State, logger: Logger): if dist.get_global_rank() == 0: log.debug('Saving Hugging Face checkpoint to disk') - from torch.distributed.fsdp import \ - FullyShardedDataParallel as FSDP - if isinstance(state.model.model, FSDP): - model_class = state.model.model.module - else: - model_class = state.model.model - - copied_config = copy.deepcopy(state.model.model.config) - if state.model.model.config.model_type == 'mpt': + copied_config = copy.deepcopy(original_model.config) + if original_model.config.model_type == 'mpt': copied_config.attn_config['attn_impl'] = 'torch' copied_config.init_device = 'cpu' # TODO: after torch 2.1, we can load a state dict into a meta model # and skip the extra model init log.debug(f'Creating new model instance') - new_model_instance: PreTrainedModel = type(model_class)( - copied_config) + new_model_instance = type(original_model)(copied_config) new_model_instance.to(dtype=self.dtype) new_model_instance.load_state_dict(state_dict) del state_dict log.debug('Saving Hugging Face checkpoint to disk') new_model_instance.save_pretrained(temp_save_dir) - if state.model.tokenizer is not None: - assert isinstance(state.model.tokenizer, - PreTrainedTokenizerBase) - state.model.tokenizer.save_pretrained(temp_save_dir) + if original_tokenizer is not None: + assert isinstance(original_tokenizer, PreTrainedTokenizerBase) + original_tokenizer.save_pretrained(temp_save_dir) # Only need to edit files for MPT because it has custom code - if state.model.model.config.model_type == 'mpt': + if original_model.config.model_type == 'mpt': log.debug('Editing MPT files for HuggingFace compatibility') edit_files_for_hf_compatibility(temp_save_dir) @@ -212,8 +217,8 @@ def _save_checkpoint(self, state: State, logger: Logger): elapsed_duration = state.get_elapsed_duration() if self.log_to_mlflow and elapsed_duration is not None and elapsed_duration >= 1.0: components = {'model': new_model_instance} - if state.model.tokenizer is not None: - components['tokenizer'] = state.model.tokenizer + if original_tokenizer is not None: + components['tokenizer'] = original_tokenizer log.debug('Logging Hugging Face model to MLFlow') registered_model_name = f'{state.run_name}_{os.path.basename(save_dir)}' diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 62922b8a36..7d31121db4 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -19,7 +19,7 @@ sys.path.append(repo_dir) import shutil from argparse import Namespace -from typing import cast +from typing import cast, Optional import pytest import torch @@ -202,10 +202,10 @@ def test_callback_inits_with_defaults(): @pytest.mark.world_size(2) @pytest.mark.gpu @pytest.mark.parametrize('model', ['mpt', 'neo', 'llama2']) -@pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded']) +@pytest.mark.parametrize('fsdp_state_dict_type', ['full', 'sharded', None]) @pytest.mark.parametrize('log_to_mlflow', [True, False]) def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, - fsdp_state_dict_type: str, + fsdp_state_dict_type: Optional[str], log_to_mlflow: bool): delete_transformers_cache() @@ -354,7 +354,7 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, 
save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=f'{save_interval_batches}ba', @@ -427,7 +427,7 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, trust_remote_code=True, ) - check_hf_model_equivalence(trainer.state.model.model.to(precision), + check_hf_model_equivalence(trainer.state.model.model.to(precision) if fsdp_state_dict_type is not None else trainer.state.model.module.model.to(precision), loaded_model) check_hf_tokenizer_equivalence(tokenizer, loaded_tokenizer) From 53ba51497fc2a233763e443efca5973b7c73a202 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 17:42:18 -0700 Subject: [PATCH 41/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 15 +++++++++------ tests/test_hf_conversion_script.py | 8 +++++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index f375fb432b..c00f6d266f 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -17,7 +17,7 @@ from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel from composer.utils import dist, format_name_with_dist_and_time, parse_uri -from transformers import PreTrainedTokenizerBase, PreTrainedModel +from transformers import PreTrainedModel, PreTrainedTokenizerBase from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils.huggingface_hub_utils import \ @@ -148,9 +148,8 @@ def _save_checkpoint(self, state: State, logger: Logger): str) # pyright doesn't know about enter_result log.debug('Gathering state dict') - from torch.distributed.fsdp import \ - FullyShardedDataParallel as FSDP - + from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + if state.is_model_ddp: original_model = state.model.module.model original_tokenizer = state.model.module.tokenizer @@ -164,7 +163,10 @@ def _save_checkpoint(self, state: State, logger: Logger): assert isinstance(original_model, PreTrainedModel) assert isinstance(original_tokenizer, PreTrainedTokenizerBase) - state_dict_context = fsdp_state_dict_type_context(original_model, state_dict_type='full') if ((not state.is_model_ddp) and isinstance(state.model.model, FSDP)) else contextlib.nullcontext() + state_dict_context = fsdp_state_dict_type_context( + original_model, state_dict_type='full') if ( + (not state.is_model_ddp) and isinstance( + state.model.model, FSDP)) else contextlib.nullcontext() with state_dict_context: state_dict = original_model.state_dict() @@ -192,7 +194,8 @@ def _save_checkpoint(self, state: State, logger: Logger): log.debug('Saving Hugging Face checkpoint to disk') new_model_instance.save_pretrained(temp_save_dir) if original_tokenizer is not None: - assert isinstance(original_tokenizer, PreTrainedTokenizerBase) + assert isinstance(original_tokenizer, + PreTrainedTokenizerBase) original_tokenizer.save_pretrained(temp_save_dir) # Only need to edit files for MPT because it has custom code diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index 7d31121db4..a5113973d4 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -19,7 +19,7 @@ sys.path.append(repo_dir) import shutil from argparse import Namespace -from typing import cast, Optional +from typing import Optional, cast import pytest import torch @@ -427,8 +427,10 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, 
trust_remote_code=True, ) - check_hf_model_equivalence(trainer.state.model.model.to(precision) if fsdp_state_dict_type is not None else trainer.state.model.module.model.to(precision), - loaded_model) + check_hf_model_equivalence( + trainer.state.model.model.to(precision) if fsdp_state_dict_type + is not None else trainer.state.model.module.model.to(precision), + loaded_model) check_hf_tokenizer_equivalence(tokenizer, loaded_tokenizer) delete_transformers_cache() From 95c93cd01baa0456c5f1ae6a465c432487c1470c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 17:44:27 -0700 Subject: [PATCH 42/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index c00f6d266f..86444266ca 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -17,7 +17,7 @@ from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel from composer.utils import dist, format_name_with_dist_and_time, parse_uri -from transformers import PreTrainedModel, PreTrainedTokenizerBase +from transformers import PreTrainedModel, PreTrainedTokenizerBase, PretrainedConfig from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils.huggingface_hub_utils import \ @@ -158,7 +158,7 @@ def _save_checkpoint(self, state: State, logger: Logger): original_tokenizer = state.model.tokenizer else: original_model = state.model.model - original_tokenizer = state.model.tokenizers + original_tokenizer = state.model.tokenizer assert isinstance(original_model, PreTrainedModel) assert isinstance(original_tokenizer, PreTrainedTokenizerBase) @@ -179,7 +179,8 @@ def _save_checkpoint(self, state: State, logger: Logger): log.debug('Saving Hugging Face checkpoint to disk') copied_config = copy.deepcopy(original_model.config) - if original_model.config.model_type == 'mpt': + assert isinstance(copied_config, PretrainedConfig) + if copied_config.model_type == 'mpt': copied_config.attn_config['attn_impl'] = 'torch' copied_config.init_device = 'cpu' From 1dfbaac584d5962c12e29e561c66f5da840d1fbc Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 29 Sep 2023 17:46:14 -0700 Subject: [PATCH 43/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 86444266ca..b44859e15a 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -17,7 +17,8 @@ from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel from composer.utils import dist, format_name_with_dist_and_time, parse_uri -from transformers import PreTrainedModel, PreTrainedTokenizerBase, PretrainedConfig +from transformers import (PreTrainedModel, + PreTrainedTokenizerBase) from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils.huggingface_hub_utils import \ @@ -151,18 +152,15 @@ def _save_checkpoint(self, state: State, logger: Logger): from torch.distributed.fsdp import FullyShardedDataParallel as FSDP if state.is_model_ddp: - original_model = state.model.module.model + original_model: PreTrainedModel = state.model.module.model original_tokenizer = state.model.module.tokenizer elif isinstance(state.model.model, 
FSDP): - original_model = state.model.model.module + original_model: PreTrainedModel = state.model.model.module original_tokenizer = state.model.tokenizer else: - original_model = state.model.model + original_model: PreTrainedModel = state.model.model original_tokenizer = state.model.tokenizer - assert isinstance(original_model, PreTrainedModel) - assert isinstance(original_tokenizer, PreTrainedTokenizerBase) - state_dict_context = fsdp_state_dict_type_context( original_model, state_dict_type='full') if ( (not state.is_model_ddp) and isinstance( @@ -179,7 +177,6 @@ def _save_checkpoint(self, state: State, logger: Logger): log.debug('Saving Hugging Face checkpoint to disk') copied_config = copy.deepcopy(original_model.config) - assert isinstance(copied_config, PretrainedConfig) if copied_config.model_type == 'mpt': copied_config.attn_config['attn_impl'] = 'torch' copied_config.init_device = 'cpu' From c71583285cb1890416daab8d7467026f3fe678ae Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 2 Oct 2023 15:53:05 -0700 Subject: [PATCH 44/54] switch to providing registered name --- llmfoundry/callbacks/hf_checkpointer.py | 29 +++++++++++++------------ tests/test_hf_conversion_script.py | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index b44859e15a..63c9b9ca0d 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -40,7 +40,8 @@ class HuggingFaceCheckpointer(Callback): huggingface_folder_name (str): Folder to save each checkpoint under (can be a format string). Default is ``ba{batch}``. precision: The precision to save the model in. Default is ``float32``. Options are ``bfloat16``, ``float16``, or ``float32``. overwrite (bool): Whether to overwrite previous checkpoints. - log_to_mlflow (bool): Whether to register the model to MLflow. This will only register one model at the end of training. Default is ``False``. + mlflow_registered_model_name (Optional[str]): The name to register the model under in the MLflow model registry. If ``None``, the model will not + be registered. Default is ``None``. mlflow_logging_config (Optional[dict]): A dictionary of config arguments that will get passed along to the MLflow ``save_model`` call. Expected to contain ``metadata`` and ``task`` keys. If either is unspecified, the defaults are ``'text-generation'`` and ``{'task': 'llm/v1/completions'}`` respectively. 
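
# Illustrative sketch (not part of the patch): how the docstring above plays out
# in practice, assuming the caller passes only a partial ``mlflow_logging_config``.
# The helper name ``resolve_mlflow_logging_config`` is hypothetical; the callback
# performs the equivalent fill-in inside ``__init__``.
def resolve_mlflow_logging_config(user_config=None):
    config = dict(user_config or {})
    # Both keys are needed for MLflow / Databricks optimized model serving:
    # ``metadata`` defaults to {'task': 'llm/v1/completions'} and ``task``
    # defaults to 'text-generation' when left unspecified.
    config.setdefault('metadata', {'task': 'llm/v1/completions'})
    config.setdefault('task', 'text-generation')
    return config
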
@@ -53,7 +54,7 @@ def __init__( huggingface_folder_name: str = 'ba{batch}', precision: str = 'float32', overwrite: bool = False, - log_to_mlflow: bool = False, + mlflow_registered_model_name: Optional[str] = None, mlflow_logging_config: Optional[dict] = None, ): self.backend, self.bucket_name, self.save_dir_format_str = parse_uri( @@ -67,13 +68,14 @@ def __init__( }[precision] # mlflow config setup - self.log_to_mlflow = log_to_mlflow - if mlflow_logging_config is None: - mlflow_logging_config = {} - if 'metadata' not in mlflow_logging_config: - mlflow_logging_config['metadata'] = {'task': 'llm/v1/completions'} - if 'task' not in mlflow_logging_config: - mlflow_logging_config['task'] = 'text-generation' + self.mlflow_registered_model_name = mlflow_registered_model_name + if self.mlflow_registered_model_name is not None: + if mlflow_logging_config is None: + mlflow_logging_config = {} + if 'metadata' not in mlflow_logging_config: + mlflow_logging_config['metadata'] = {'task': 'llm/v1/completions'} + if 'task' not in mlflow_logging_config: + mlflow_logging_config['task'] = 'text-generation' self.mlflow_logging_config = mlflow_logging_config self.huggingface_folder_name_fstr = os.path.join( @@ -106,7 +108,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: self.remote_ud.init(state, logger) state.callbacks.append(self.remote_ud) - if self.log_to_mlflow: + if self.mlflow_registered_model_name is not None: self.mlflow_loggers = [ logger_destination for logger_destination in logger.destinations @@ -216,16 +218,15 @@ def _save_checkpoint(self, state: State, logger: Logger): ) elapsed_duration = state.get_elapsed_duration() - if self.log_to_mlflow and elapsed_duration is not None and elapsed_duration >= 1.0: + if self.mlflow_registered_model_name is not None and elapsed_duration is not None and elapsed_duration >= 1.0: components = {'model': new_model_instance} if original_tokenizer is not None: components['tokenizer'] = original_tokenizer log.debug('Logging Hugging Face model to MLFlow') - registered_model_name = f'{state.run_name}_{os.path.basename(save_dir)}' for i, mlflow_logger in enumerate(self.mlflow_loggers): log.debug( - f'Registering model to UC at {mlflow_logger.model_registry_prefix}.{registered_model_name}' + f'Registering model to UC at {mlflow_logger.model_registry_prefix}.{self.mlflow_registered_model_name}' ) local_save_path = str( Path(temp_save_dir) / f'mlflow_save_{i}') @@ -241,6 +242,6 @@ def _save_checkpoint(self, state: State, logger: Logger): ) mlflow_logger.register_model( model_uri=local_save_path, - name=registered_model_name, + name=self.mlflow_registered_model_name, await_registration_for=3600, ) diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index a5113973d4..e0774240f8 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -224,7 +224,7 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=f'{huggingface_save_interval_batches}ba', precision=precision_str, - log_to_mlflow=log_to_mlflow, + mlflow_registered_model_name='dummy-registered-name', ) # get small version of each model From 5b5f0398294328b181d39d2b7d92e8bf4ce39552 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 2 Oct 2023 15:59:01 -0700 Subject: [PATCH 45/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 63c9b9ca0d..464e1fd755 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -17,8 +17,7 @@ from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel from composer.utils import dist, format_name_with_dist_and_time, parse_uri -from transformers import (PreTrainedModel, - PreTrainedTokenizerBase) +from transformers import PreTrainedModel, PreTrainedTokenizerBase from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils.huggingface_hub_utils import \ @@ -73,7 +72,9 @@ def __init__( if mlflow_logging_config is None: mlflow_logging_config = {} if 'metadata' not in mlflow_logging_config: - mlflow_logging_config['metadata'] = {'task': 'llm/v1/completions'} + mlflow_logging_config['metadata'] = { + 'task': 'llm/v1/completions' + } if 'task' not in mlflow_logging_config: mlflow_logging_config['task'] = 'text-generation' self.mlflow_logging_config = mlflow_logging_config From 499a12184d7178aa95d8d9e7bcf0af0afea48953 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 11 Oct 2023 15:11:03 -0700 Subject: [PATCH 46/54] bump composer pin --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a686dd0808..d0ecc66160 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.16.3,<0.17', + 'mosaicml[libcloud,wandb,mlflow,oci,gcs]>=0.16.4,<0.17', 'accelerate>=0.20,<0.21', # for HF inference `device_map` 'transformers>=4.33,<4.34', 'mosaicml-streaming>=0.6,<0.7', From 047b320904e495f38c450f973d5504dfdf0cfcde Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 11 Oct 2023 15:28:23 -0700 Subject: [PATCH 47/54] pyright --- llmfoundry/optim/scheduler.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llmfoundry/optim/scheduler.py b/llmfoundry/optim/scheduler.py index c29f73739e..28ff7c82bb 100644 --- a/llmfoundry/optim/scheduler.py +++ b/llmfoundry/optim/scheduler.py @@ -20,6 +20,9 @@ def _raise_if_units_dont_match(time: Union[str, Time], t_max: Union[str, Time], time = Time.from_timestring(time) if isinstance(t_max, str): t_max = Time.from_timestring(t_max) + + assert isinstance(time, Time) and isinstance(t_max, Time) + if time.unit != t_max.unit: raise ValueError(f'{time.unit=} does not match {t_max.unit=}.') @@ -27,6 +30,8 @@ def _raise_if_units_dont_match(time: Union[str, Time], t_max: Union[str, Time], def _raise_if_units_dur(time: Union[str, Time], name: str) -> None: if isinstance(time, str): time = Time.from_timestring(time) + + assert isinstance(time, Time) if time.unit == TimeUnit('dur'): raise ValueError(f'{name} cannot be in units of "dur".') From e952250a21cb8a5938d90029afb4c3f118151e90 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 11 Oct 2023 16:02:38 -0700 Subject: [PATCH 48/54] types --- llmfoundry/callbacks/hf_checkpointer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 464e1fd755..74b9d94b41 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -68,9 +68,9 @@ def __init__( # mlflow config setup self.mlflow_registered_model_name = mlflow_registered_model_name + if mlflow_logging_config is None: + mlflow_logging_config = {} if self.mlflow_registered_model_name is not None: - if 
mlflow_logging_config is None: - mlflow_logging_config = {} if 'metadata' not in mlflow_logging_config: mlflow_logging_config['metadata'] = { 'task': 'llm/v1/completions' From 795b9ae7692f3d316696436970e31e4e303b1a5c Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 12 Oct 2023 00:33:56 +0000 Subject: [PATCH 49/54] fixes --- llmfoundry/callbacks/hf_checkpointer.py | 11 +++++++---- tests/test_hf_conversion_script.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 74b9d94b41..e0af3f4b15 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -117,9 +117,9 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: ] if len(self.mlflow_loggers) == 0: raise ValueError( - f'`log_to_mlflow` was set to `True` but no `MLFlowLogger` was found in the `logger.destinations` list. ' + f'`mlflow_registered_model_name` was set, but no `MLFlowLogger` was found in the `logger.destinations` list. ' + - 'Please add an `MLFlowLogger` or set `log_to_mlflow` to `False`.' + 'Please add an `MLFlowLogger` or set `mlflow_registered_model_name` to `None`.' ) import mlflow @@ -156,9 +156,11 @@ def _save_checkpoint(self, state: State, logger: Logger): if state.is_model_ddp: original_model: PreTrainedModel = state.model.module.model + state_dict_model = state.model.module.model original_tokenizer = state.model.module.tokenizer elif isinstance(state.model.model, FSDP): original_model: PreTrainedModel = state.model.model.module + state_dict_model = state.model.model original_tokenizer = state.model.tokenizer else: original_model: PreTrainedModel = state.model.model @@ -167,9 +169,10 @@ def _save_checkpoint(self, state: State, logger: Logger): state_dict_context = fsdp_state_dict_type_context( original_model, state_dict_type='full') if ( (not state.is_model_ddp) and isinstance( - state.model.model, FSDP)) else contextlib.nullcontext() + state_dict_model, FSDP)) else contextlib.nullcontext() + with state_dict_context: - state_dict = original_model.state_dict() + state_dict = state_dict_model.state_dict() # convert the state dict to the requested precision for k, v in state_dict.items(): diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index e0774240f8..aa1d17a151 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -224,7 +224,7 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=f'{huggingface_save_interval_batches}ba', precision=precision_str, - mlflow_registered_model_name='dummy-registered-name', + mlflow_registered_model_name='dummy-registered-name' if log_to_mlflow else None, ) # get small version of each model From 9af48a2d65583a04d18f37105d5b1ffde5ba54d1 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 11 Oct 2023 17:35:07 -0700 Subject: [PATCH 50/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 1 + tests/test_hf_conversion_script.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index e0af3f4b15..843ca102f3 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -164,6 +164,7 @@ def _save_checkpoint(self, state: State, logger: Logger): original_tokenizer = state.model.tokenizer else: original_model: PreTrainedModel 
= state.model.model + state_dict_model = state.model.model original_tokenizer = state.model.tokenizer state_dict_context = fsdp_state_dict_type_context( diff --git a/tests/test_hf_conversion_script.py b/tests/test_hf_conversion_script.py index aa1d17a151..5bc3ed6d5d 100644 --- a/tests/test_hf_conversion_script.py +++ b/tests/test_hf_conversion_script.py @@ -224,7 +224,8 @@ def test_huggingface_conversion_callback(model: str, tmp_path: pathlib.Path, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=f'{huggingface_save_interval_batches}ba', precision=precision_str, - mlflow_registered_model_name='dummy-registered-name' if log_to_mlflow else None, + mlflow_registered_model_name='dummy-registered-name' + if log_to_mlflow else None, ) # get small version of each model From 4e819388af621e817c0c3989d944d5f5ad144af7 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 11 Oct 2023 17:38:42 -0700 Subject: [PATCH 51/54] more precommit --- llmfoundry/optim/scheduler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llmfoundry/optim/scheduler.py b/llmfoundry/optim/scheduler.py index 28ff7c82bb..4a6d21c873 100644 --- a/llmfoundry/optim/scheduler.py +++ b/llmfoundry/optim/scheduler.py @@ -21,7 +21,7 @@ def _raise_if_units_dont_match(time: Union[str, Time], t_max: Union[str, Time], if isinstance(t_max, str): t_max = Time.from_timestring(t_max) - assert isinstance(time, Time) and isinstance(t_max, Time) + assert not isinstance(time, str) and not isinstance(t_max, str) if time.unit != t_max.unit: raise ValueError(f'{time.unit=} does not match {t_max.unit=}.') @@ -31,7 +31,8 @@ def _raise_if_units_dur(time: Union[str, Time], name: str) -> None: if isinstance(time, str): time = Time.from_timestring(time) - assert isinstance(time, Time) + assert not isinstance(time, str) + if time.unit == TimeUnit('dur'): raise ValueError(f'{name} cannot be in units of "dur".') From 0dae7a762a7e3966cafc6b011d6a1500bdc1e049 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 11 Oct 2023 17:47:01 -0700 Subject: [PATCH 52/54] add comment --- llmfoundry/callbacks/hf_checkpointer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 843ca102f3..a02f8b98a0 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -71,6 +71,8 @@ def __init__( if mlflow_logging_config is None: mlflow_logging_config = {} if self.mlflow_registered_model_name is not None: + # Both the metadata and the task are needed in order for mlflow + # and databricks optimized model serving to work if 'metadata' not in mlflow_logging_config: mlflow_logging_config['metadata'] = { 'task': 'llm/v1/completions' From 354bb1148b9702512fab2b44c3a15f6722a2def4 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 16 Oct 2023 17:47:57 -0700 Subject: [PATCH 53/54] update import path --- llmfoundry/callbacks/hf_checkpointer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index a02f8b98a0..ff63c779d8 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -10,7 +10,7 @@ from typing import Optional, Union import torch -from composer.callbacks.utils import create_interval_scheduler +from composer.utils.misc import create_interval_scheduler from composer.core import Callback, Event, State, Time from composer.core.state import fsdp_state_dict_type_context from 
composer.loggers import Logger, MLFlowLogger From af5b1c0be22828c406b63413e64b9e857aa82df4 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 16 Oct 2023 17:48:43 -0700 Subject: [PATCH 54/54] precommit --- llmfoundry/callbacks/hf_checkpointer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index ff63c779d8..aa3beda513 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -10,13 +10,13 @@ from typing import Optional, Union import torch -from composer.utils.misc import create_interval_scheduler from composer.core import Callback, Event, State, Time from composer.core.state import fsdp_state_dict_type_context from composer.loggers import Logger, MLFlowLogger from composer.loggers.remote_uploader_downloader import RemoteUploaderDownloader from composer.models import HuggingFaceModel from composer.utils import dist, format_name_with_dist_and_time, parse_uri +from composer.utils.misc import create_interval_scheduler from transformers import PreTrainedModel, PreTrainedTokenizerBase from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM
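
Taken together, the series ends with a callback that writes a Hugging Face
checkpoint at every save interval and, once training completes, registers the
final checkpoint to MLflow under ``mlflow_registered_model_name``. Below is a
minimal sketch of how the finished callback might be wired into a Composer
Trainer; the save folder, experiment name, registered model name, and the
``composer_model`` / ``train_dataloader`` objects are placeholders assumed to be
built elsewhere (for example by the llm-foundry train script), not values taken
from this diff.

from composer import Trainer
from composer.loggers import MLFlowLogger
from llmfoundry.callbacks import HuggingFaceCheckpointer

hf_checkpointer = HuggingFaceCheckpointer(
    save_folder='./my-run/checkpoints',       # placeholder path
    save_interval='500ba',
    precision='bfloat16',
    mlflow_registered_model_name='my-model',  # placeholder; prefixed at register
                                              # time by the logger's
                                              # model_registry_prefix
    mlflow_logging_config={
        'task': 'text-generation',
        'metadata': {'task': 'llm/v1/completions'},
    },
)

trainer = Trainer(
    model=composer_model,               # assumed: a HuggingFaceModel wrapper
    train_dataloader=train_dataloader,  # assumed: built elsewhere
    max_duration='4800ba',
    callbacks=[hf_checkpointer],
    # An MLFlowLogger must be present in ``loggers``; otherwise the callback
    # raises a ValueError at INIT when a registered model name is set.
    loggers=[MLFlowLogger(experiment_name='/Users/me/my-experiment')],
)
trainer.fit()

Because registration only fires once the elapsed duration reaches 1.0, a single
registered name suffices even though intermediate Hugging Face checkpoints are
still written (and optionally uploaded to object storage) at every save interval.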