From 5465db4ff9850bc12afcb6fedd5737375bfc5fec Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 12 Sep 2024 12:45:12 -0700 Subject: [PATCH 01/17] Raise DatasetTooSmall exception if canonical nodes is less than num samples (#1518) Co-authored-by: Saaketh Narayan Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- .../data_prep/convert_text_to_mds.py | 4 +- llmfoundry/data/finetuning/tasks.py | 20 ++++++++- llmfoundry/utils/exceptions.py | 6 +-- tests/data/test_dataset.py | 44 +++++++++++++++++++ 4 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 tests/data/test_dataset.py diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 7c40a7e698..9a1f8a912d 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -478,7 +478,9 @@ def convert_text_to_mds( index_path = os.path.join(local_output_folder, 'index.json') with open(index_path, 'r') as index_file: if not json.load(index_file)['shards']: - raise DatasetTooSmallError() + raise DatasetTooSmallError( + reason='No shards were created when converting text to MDS.', + ) # Write a done file with the args and object names write_done_file(local_output_folder, args_str, object_names) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 297962dd8a..e8f6484ef2 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -73,6 +73,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: ALLOWED_RESPONSE_KEYS, ChatTemplateError, ConsecutiveRepeatedChatRolesError, + DatasetTooSmallError, IncorrectMessageKeyQuantityError, InvalidContentTypeError, InvalidExampleTypeError, @@ -1033,7 +1034,24 @@ def build_from_streaming( *args: Any, **kwargs: Any, ) -> StreamingFinetuningDataset: - return self.streaming_dataset_class(*args, **kwargs) + dataset = self.streaming_dataset_class(*args, **kwargs) + num_canonical_nodes = dataset.num_canonical_nodes + num_samples = dataset.num_samples + if num_canonical_nodes is None: + num_physical_nodes = dist.get_world_size( + ) // dist.get_local_world_size() + if num_samples < num_physical_nodes: + raise DatasetTooSmallError( + f'{num_samples=} is less than {dist.get_world_size() // dist.get_local_world_size()}, the number of physical nodes. ', + ) + + if num_canonical_nodes is not None and num_samples < num_canonical_nodes: + raise DatasetTooSmallError( + f'{num_samples=} is less than {num_canonical_nodes=}. ' + + 'Please check your index.json file and ensure that your dataset has been written out correctly.' + + 'If this was intended, reduce num_canonical_nodes.', + ) + return dataset dataset_constructor = DatasetConstructor() diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 345a254407..68045fdaa3 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -376,9 +376,9 @@ def __init__(self, dataset_name: str, split: str) -> None: class DatasetTooSmallError(UserError): """Error thrown when the dataset is too small to be processed.""" - def __init__(self) -> None: - message = f'Your dataset is too small and produced no complete samples during preprocessing. Please provide more data.' - super().__init__(message) + def __init__(self, reason: str) -> None: + message = f'Your dataset is too small and produced no complete samples or too few samples. Please provide more data. 
{reason}' + super().__init__(message, reason=reason) class RunTimeoutError(InternalError): diff --git a/tests/data/test_dataset.py b/tests/data/test_dataset.py new file mode 100644 index 0000000000..071c189b68 --- /dev/null +++ b/tests/data/test_dataset.py @@ -0,0 +1,44 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 +from contextlib import nullcontext +from typing import Optional +from unittest import mock + +import pytest + +from llmfoundry.data.finetuning.tasks import dataset_constructor +from llmfoundry.utils.exceptions import DatasetTooSmallError + + +@pytest.mark.parametrize('num_canonical_nodes', [None, 8, 2]) +def test_finetuning_streaming_dataset_too_small( + num_canonical_nodes: Optional[int], +): + num_samples = 2 + + class MockDataset: + + def __init__(self): + self.num_canonical_nodes = num_canonical_nodes + self.num_samples = num_samples + + class MockDist: + + def get_world_size(self): + return 32 + + def get_local_world_size(self): + return 8 + + result_context = nullcontext( + ) if num_canonical_nodes == 2 else pytest.raises(DatasetTooSmallError) + with result_context: + with mock.patch( + 'llmfoundry.data.finetuning.tasks.dist', + new=MockDist(), + ): + with mock.patch( + 'llmfoundry.data.finetuning.tasks.DatasetConstructor.streaming_dataset_class', + new=MockDataset, + ): + dataset_constructor.build_from_streaming() From dab768f7e27d0d725bb911a9671d147532e411bc Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 12 Sep 2024 13:09:50 -0700 Subject: [PATCH 02/17] Add permissions check for delta table reading (#1522) --- .../command_utils/data_prep/convert_delta_to_json.py | 7 +++++++ llmfoundry/utils/exceptions.py | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index 50d11b1222..666d0278c6 100644 --- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -22,6 +22,7 @@ ClusterDoesNotExistError, FailedToConnectToDatabricksError, FailedToCreateSQLConnectionError, + InsufficientPermissionsError, ) if TYPE_CHECKING: @@ -454,6 +455,12 @@ def fetch( sparkSession, ) except Exception as e: + from pyspark.errors import AnalysisException + if isinstance(e, AnalysisException): + if 'INSUFFICIENT_PERMISSIONS' in e.message: # pyright: ignore + raise InsufficientPermissionsError( + action=f'reading from {tablename}', + ) from e raise RuntimeError( f'Error in get rows from {tablename}. Restart sparkSession and try again', ) from e diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 68045fdaa3..11895564f2 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -427,3 +427,11 @@ def __init__( window_size=window_size, loss_window=loss_window, ) + + +class InsufficientPermissionsError(UserError): + """Error thrown when the user does not have sufficient permissions.""" + + def __init__(self, action: str) -> None: + message = f'Insufficient permissions when {action}. Please check your permissions.' 
+ super().__init__(message, action=action) From a862d6e4d269d7f881c6b59bdc9667ef4cab5613 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 12 Sep 2024 16:52:06 -0700 Subject: [PATCH 03/17] Add HuggingFaceCheckpointer option for only registering final checkpoint (#1516) --- llmfoundry/callbacks/hf_checkpointer.py | 160 +++++++++++++----- llmfoundry/command_utils/train.py | 7 +- .../inference/test_convert_composer_to_hf.py | 154 +++++++++++++++-- 3 files changed, 257 insertions(+), 64 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index f05e7322a8..4e6a501f2f 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -10,6 +10,7 @@ import shutil import tempfile import time +import warnings from multiprocessing.context import SpawnProcess from pathlib import Path from typing import Any, Optional, Sequence, Union @@ -18,6 +19,7 @@ import torch import torch.nn as nn from composer.core import Callback, Event, Precision, State, Time, TimeUnit +from composer.devices import Device from composer.loggers import Logger, MLFlowLogger from composer.models import HuggingFaceModel from composer.utils import ( @@ -161,6 +163,10 @@ class HuggingFaceCheckpointer(Callback): keys ``input_example`` and ``signature``. flatten_imports (Sequence[str]): A sequence of import prefixes that will be flattened when editing MPT files. + final_register_only (bool): If true, only register the model in the MLFlow + registry on the last batch and do not save the HuggingFace checkpoint. If + registration fails or mlflow_registered_model_name is not set, then we will + fallback to saving the HuggingFace checkpoint. """ def __init__( @@ -173,6 +179,7 @@ def __init__( mlflow_registered_model_name: Optional[str] = None, mlflow_logging_config: Optional[dict] = None, flatten_imports: Sequence[str] = ('llmfoundry',), + final_register_only: bool = False, ): _, _, self.save_dir_format_str = parse_uri(save_folder) self.overwrite = overwrite @@ -185,8 +192,18 @@ def __init__( self.flatten_imports = flatten_imports self.using_peft = False - # mlflow config setup + self.final_register_only = final_register_only + self.mlflow_registered_model_name = mlflow_registered_model_name + if self.final_register_only and self.mlflow_registered_model_name is None: + self.final_register_only = False + warnings.warn( + 'final_register_only is set to True, but mlflow_registered_model_name is not set. ' + + + f'Defaulting to final_register_only=False and saving the HuggingFace checkpoint to {save_folder=}.', + ) + + # mlflow config setup if mlflow_logging_config is None: mlflow_logging_config = {} if self.mlflow_registered_model_name is not None: @@ -249,7 +266,7 @@ def __init__( self.last_checkpoint_batch: Optional[Time] = None self.mlflow_loggers = [] - self.child_processes: list[SpawnProcess] = [] + self.register_processes: list[SpawnProcess] = [] # Temporary save directory used by child_processes. 
self.temp_save_dir = None @@ -259,7 +276,17 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: state, event, ) and self.last_checkpoint_batch != state.timestamp.batch: - self._save_checkpoint(state, logger) + is_last_batch = self._is_last_batch(state) + self._save_checkpoint( + state, + logger, + register_to_mlflow=( + self.mlflow_registered_model_name is not None and + is_last_batch + ), + upload_to_save_folder=not self.final_register_only or + not is_last_batch, + ) elif event == Event.INIT: if not isinstance(state.model, HuggingFaceModel): raise ValueError( @@ -300,7 +327,7 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: # Wait for all child processes spawned by the callback to finish. timeout = 3600 wait_start = time.time() - while not self._all_child_processes_done(): + while not self._all_register_processes_done(state.device): wait_time = time.time() - wait_start if wait_time > timeout: raise TimeoutError( @@ -308,6 +335,19 @@ def run_event(self, event: Event, state: State, logger: Logger) -> None: ) time.sleep(2) + if self._any_register_processes_error( + state.device, + ) and self.final_register_only: + log.error( + 'An error occurred in one or more registration processes. Fallback to saving the HuggingFace checkpoint.', + ) + self._save_checkpoint( + state, + logger, + upload_to_save_folder=True, + register_to_mlflow=False, + ) + # Clean up temporary save directory; all processes are done with it. if self.temp_save_dir is not None: shutil.rmtree(self.temp_save_dir) @@ -339,12 +379,23 @@ def _is_last_batch(self, state: State): return False - def _all_child_processes_done(self) -> bool: - not_done = any(process.is_alive() for process in self.child_processes) - x = torch.tensor(1 if not_done else 0).to(device='cuda') + def _all_register_processes_done(self, device: Device) -> bool: + not_done = any( + process.is_alive() for process in self.register_processes + ) + x = device.tensor_to_device(torch.tensor(1 if not_done else 0)) dist.all_reduce(x, reduce_operation='MAX') return x.item() == 0 + def _any_register_processes_error(self, device: Device) -> bool: + has_errors = any( + process.exitcode is not None and process.exitcode != 0 + for process in self.register_processes + ) + x = device.tensor_to_device(torch.tensor(1 if has_errors else 0)) + dist.all_reduce(x, reduce_operation='MAX') + return x.item() == 1 + def transform_model_and_tokenizer( self, model: PreTrainedModel, @@ -412,7 +463,21 @@ def transform_model_pre_registration( """ return model - def _save_checkpoint(self, state: State, logger: Logger): + def _save_checkpoint( + self, + state: State, + logger: Logger, + upload_to_save_folder: bool, + register_to_mlflow: bool, + ): + """Save a HuggingFace formatted checkpoint. + + Args: + state (State): The training state. + logger (Logger): The logger. + upload_to_save_folder (bool): Whether to upload the HF checkpoint to the save folder. + register_to_mlflow (bool): Whether to register the model to MLFlow + """ del logger # unused self.last_checkpoint_batch = state.timestamp.batch @@ -548,50 +613,53 @@ def tensor_hook( ].base_model_name_or_path = self.pretrained_model_name log.debug('Saving Hugging Face checkpoint to disk') - # This context manager casts the TE extra state in io.BytesIO format to tensor format - # Needed for proper hf ckpt saving. 
- context_manager = te.onnx_export( - True, - ) if is_te_imported and state.precision == Precision.AMP_FP8 else contextlib.nullcontext( - ) - with context_manager: - new_model_instance.save_pretrained(temp_save_dir) - if original_tokenizer is not None: - assert isinstance( - original_tokenizer, - PreTrainedTokenizerBase, - ) - original_tokenizer.save_pretrained(temp_save_dir) - - # Only need to edit files for MPT because it has custom code - if new_model_instance.config.model_type == 'mpt': - log.debug('Editing MPT files for HuggingFace compatibility') - edit_files_for_hf_compatibility( - temp_save_dir, - self.flatten_imports, - ) - if self.remote_ud is not None: - for filename in os.listdir(temp_save_dir): - remote_file_name = os.path.join(save_dir, filename) - remote_file_uri = self.remote_ud.remote_backend.get_uri( - remote_file_name, - ) - log.info( - f'Uploading HuggingFace formatted checkpoint to {remote_file_uri}', + if upload_to_save_folder: + # This context manager casts the TE extra state in io.BytesIO format to tensor format + # Needed for proper hf ckpt saving. + context_manager = te.onnx_export( + True, + ) if is_te_imported and state.precision == Precision.AMP_FP8 else contextlib.nullcontext( + ) + with context_manager: + new_model_instance.save_pretrained(temp_save_dir) + if original_tokenizer is not None: + assert isinstance( + original_tokenizer, + PreTrainedTokenizerBase, ) - self.remote_ud.upload_file( - state=state, - remote_file_name=remote_file_name, - file_path=Path(os.path.join(temp_save_dir, filename)), - overwrite=self.overwrite, + original_tokenizer.save_pretrained(temp_save_dir) + + # Only need to edit files for MPT because it has custom code + if new_model_instance.config.model_type == 'mpt': + log.debug('Editing MPT files for HuggingFace compatibility') + edit_files_for_hf_compatibility( + temp_save_dir, + self.flatten_imports, ) + if self.remote_ud is not None: + for filename in os.listdir(temp_save_dir): + remote_file_name = os.path.join(save_dir, filename) + remote_file_uri = self.remote_ud.remote_backend.get_uri( + remote_file_name, + ) + log.info( + f'Uploading HuggingFace formatted checkpoint to {remote_file_uri}', + ) + self.remote_ud.upload_file( + state=state, + remote_file_name=remote_file_name, + file_path=Path( + os.path.join(temp_save_dir, filename), + ), + overwrite=self.overwrite, + ) + dist.barrier() if dist.get_global_rank() == 0: - if self.mlflow_registered_model_name and self._is_last_batch(state): - + if register_to_mlflow: new_model_instance = self.transform_model_pre_registration( new_model_instance, ) @@ -680,7 +748,7 @@ def tensor_hook( # Restore the monitor process. if monitor_process is not None: mlflow_logger.monitor_process = monitor_process # type: ignore - self.child_processes.append(process) + self.register_processes.append(process) # Save the temporary directory to be cleaned up later. 
if use_temp_dir: diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 73fa4c8d5a..14b7980d57 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -584,7 +584,12 @@ def train(cfg: DictConfig) -> Trainer: ) hf_checkpointer_callback = hf_checkpointer_callbacks[0] - hf_checkpointer_callback._save_checkpoint(trainer.state, trainer.logger) + hf_checkpointer_callback._save_checkpoint( + trainer.state, + trainer.logger, + upload_to_save_folder=True, + register_to_mlflow=True, + ) return trainer if train_cfg.only_composer_checkpoint: diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index b863e1d0a8..4f1bd63c62 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -9,6 +9,7 @@ import shutil from argparse import Namespace from typing import Any, Callable, Optional, cast +from unittest import mock from unittest.mock import ANY, MagicMock, patch import catalogue @@ -314,9 +315,15 @@ class MockSpawnProcess: multiprocessing, so we need to patch SpawnProcess for tests. """ - def __init__(self, target: Callable, kwargs: dict[str, Any]): + def __init__( + self, + target: Callable, + kwargs: dict[str, Any], + exitcode: int = 0, + ): self.target = target self.kwargs = kwargs + self.exitcode = exitcode def start(self): self.target(**self.kwargs) @@ -325,6 +332,133 @@ def is_alive(self) -> bool: return False +def _create_mlflow_logger_mock() -> MagicMock: + mlflow_logger_mock = MagicMock(spec=MLFlowLogger) + mlflow_logger_mock.state_dict = lambda *args, **kwargs: {} + mlflow_logger_mock.save_model = MagicMock(wraps=_save_model_mock) + mlflow_logger_mock.register_model_with_run_id = MagicMock() + mlflow_logger_mock.model_registry_prefix = '' + mlflow_logger_mock._experiment_id = 'mlflow-experiment-id' + mlflow_logger_mock._run_id = 'mlflow-run-id' + mlflow_logger_mock._enabled = True + mlflow_logger_mock.run_url = 'fake-url' + return mlflow_logger_mock + + +def _create_optimizer(original_model: torch.nn.Module) -> torch.optim.Optimizer: + optimizer_config = _OPTIMIZER_CFG() + optimizer_name = optimizer_config.pop('name') + return build_optimizer( + original_model, + optimizer_name, + optimizer_config, + ) + + +@pytest.mark.gpu +@pytest.mark.parametrize('mlflow_registry_error', [True, False]) +@pytest.mark.parametrize( + 'mlflow_registered_model_name', + [None, 'dummy-registered-name'], +) +@patch('os.cpu_count', MagicMock(return_value=1)) +@patch( + 'llmfoundry.callbacks.hf_checkpointer.SpawnProcess', + new=MockSpawnProcess, +) +def test_final_register_only( + mlflow_registry_error: bool, + mlflow_registered_model_name: Optional[str], + tiny_ft_dataloader: DataLoader, + tmp_path: pathlib.Path, + build_tiny_mpt: Callable, +): + if mlflow_registry_error and mlflow_registered_model_name is None: + pytest.skip( + 'Cannot test mlflow_registry_error without mlflow_registered_model_name', + ) + + delete_transformers_cache() + + dist.initialize_dist(get_device('gpu')) + + precision_str = 'bfloat16' + + checkpointer_callback = HuggingFaceCheckpointer( + save_folder=os.path.join(tmp_path, 'checkpoints'), + save_interval='1dur', + precision=precision_str, + mlflow_registered_model_name=mlflow_registered_model_name, + final_register_only=True, + ) + + original_model = build_tiny_mpt() + + optimizer = _create_optimizer(original_model) + + mlflow_logger_mock = _create_mlflow_logger_mock() + + 
checkpointer_callback._save_checkpoint = MagicMock( + wraps=checkpointer_callback._save_checkpoint, + ) + trainer = Trainer( + model=original_model, + device='gpu', + train_dataloader=tiny_ft_dataloader, + max_duration='1ba', + callbacks=[checkpointer_callback], + loggers=[mlflow_logger_mock], + optimizers=optimizer, + save_latest_filename=None, + ) + + with mock.patch( + 'llmfoundry.callbacks.hf_checkpointer.SpawnProcess', + new=lambda target, + kwargs: MockSpawnProcess( + target, + kwargs, + exitcode=1 if mlflow_registry_error else 0, + ), + ): + trainer.fit() + + if mlflow_registered_model_name is not None: + # We should always attempt to register the model once + assert mlflow_logger_mock.register_model_with_run_id.call_count == 1 + if mlflow_registry_error: + # If the registry fails, we should still save the model + assert mlflow_logger_mock.register_model_with_run_id.call_count == 1 + assert checkpointer_callback._save_checkpoint.call_count == 2 + assert checkpointer_callback._save_checkpoint.call_args_list[ + 0].kwargs == { + 'register_to_mlflow': True, + 'upload_to_save_folder': False, + } + assert checkpointer_callback._save_checkpoint.call_args_list[ + 1].kwargs == { + 'register_to_mlflow': False, + 'upload_to_save_folder': True, + } + else: + # No mlflow_registry_error, so we should only register the model + assert checkpointer_callback._save_checkpoint.call_count == 1 + assert checkpointer_callback._save_checkpoint.call_args_list[ + 0].kwargs == { + 'register_to_mlflow': True, + 'upload_to_save_folder': False, + } + else: + # No mlflow_registered_model_name, so we should only save the checkpoint + assert mlflow_logger_mock.register_model_with_run_id.call_count == 0 + assert checkpointer_callback._save_checkpoint.call_count == 1 + assert checkpointer_callback._save_checkpoint.call_args_list[ + 0].kwargs == { + 'register_to_mlflow': False, + 'upload_to_save_folder': True, + } + + @pytest.mark.gpu @pytest.mark.parametrize('log_to_mlflow', [True, False]) @pytest.mark.parametrize( @@ -368,23 +502,9 @@ def test_huggingface_conversion_callback_interval( original_model = build_tiny_mpt() - optimizer_config = _OPTIMIZER_CFG() - optimizer_name = optimizer_config.pop('name') - optimizer = build_optimizer( - original_model, - optimizer_name, - optimizer_config, - ) + optimizer = _create_optimizer(original_model) - mlflow_logger_mock = MagicMock(spec=MLFlowLogger) - mlflow_logger_mock.state_dict = lambda *args, **kwargs: {} - mlflow_logger_mock.save_model = MagicMock(wraps=_save_model_mock) - mlflow_logger_mock.register_model_with_run_id = MagicMock() - mlflow_logger_mock.model_registry_prefix = '' - mlflow_logger_mock._experiment_id = 'mlflow-experiment-id' - mlflow_logger_mock._run_id = 'mlflow-run-id' - mlflow_logger_mock._enabled = True - mlflow_logger_mock.run_url = 'fake-url' + mlflow_logger_mock = _create_mlflow_logger_mock() checkpointer_callback.transform_model_pre_registration = MagicMock( wraps=checkpointer_callback.transform_model_pre_registration, ) From 83ab9c30e0a2432bcc6213e4cb8b55296b13e438 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 16 Sep 2024 13:54:10 -0700 Subject: [PATCH 04/17] Replace FSDP args (#1517) Co-authored-by: v-chen_data --- llmfoundry/command_utils/eval.py | 8 ++++++-- tests/a_scripts/inference/test_convert_composer_to_hf.py | 5 +++-- tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/test_fsdp_act_checkpoint.py | 2 +- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git 
a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index f622ca182d..eca16bd815 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -52,7 +52,7 @@ def evaluate_model( device_eval_batch_size: Union[int, float], eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], - fsdp_config: Optional[dict[str, Any]], + parallelism_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: Optional[str], precision: str, @@ -99,6 +99,10 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) + fsdp_config = parallelism_config.get( + 'fsdp_config', + None, + ) if parallelism_config else None if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + @@ -316,7 +320,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: device_eval_batch_size=eval_config.device_eval_batch_size, eval_gauntlet_config=eval_gauntlet_config, eval_loader_config=eval_loader_config, - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, loggers=loggers, python_log_level=eval_config.python_log_level, precision=eval_config.precision, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 4f1bd63c62..66ec739a65 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1042,7 +1042,8 @@ def test_huggingface_conversion_callback( model=original_model, device='gpu', precision=trainer_precision, - fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, + parallelism_config={'fsdp': fsdp_config} + if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, @@ -1469,7 +1470,7 @@ def test_mptmoe_huggingface_conversion_callback( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 69ced673a1..8e6c113169 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -91,7 +91,7 @@ def test_fsdp_weight_tying( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 56cb36c8c1..01acc22a60 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -125,7 +125,7 @@ def test_lora_mixed_init( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index a41574538a..366bcf7786 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -59,7 +59,7 @@ def test_fsdp_act_checkpoint( trainer = Trainer( 
model=model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, ) assert trainer.state.fsdp_enabled From 0114f33da83b5e2c43f6399f69acd8401525a9e8 Mon Sep 17 00:00:00 2001 From: Abhay Gupta Date: Mon, 16 Sep 2024 17:09:12 -0700 Subject: [PATCH 05/17] enable correct padding_idx for embedding layers (#1527) --- llmfoundry/models/mpt/modeling_mpt.py | 1 + llmfoundry/models/utils/param_init_fns.py | 3 +++ tests/models/utils/test_param_init_fns.py | 27 +++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 06b64101c3..cfe1172634 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -396,6 +396,7 @@ def __init__(self, config: MPTConfig): self.wte = SharedEmbedding( config.vocab_size, config.d_model, + padding_idx=config.pad_token_id, device=config.init_device, ) if self.learned_pos_emb: diff --git a/llmfoundry/models/utils/param_init_fns.py b/llmfoundry/models/utils/param_init_fns.py index 180e7b894c..8ad6e77c57 100644 --- a/llmfoundry/models/utils/param_init_fns.py +++ b/llmfoundry/models/utils/param_init_fns.py @@ -224,6 +224,9 @@ def embedding_init( emb_init_fn_ = init_fn_ emb_init_fn_(module.weight) + if module.padding_idx is not None: + with torch.no_grad(): + module.weight[module.padding_idx].fill_(0) return True diff --git a/tests/models/utils/test_param_init_fns.py b/tests/models/utils/test_param_init_fns.py index 0eaf60c869..11d9fba430 100644 --- a/tests/models/utils/test_param_init_fns.py +++ b/tests/models/utils/test_param_init_fns.py @@ -199,3 +199,30 @@ def test_emb_init(emb_init_cfg: Optional[tuple[str, Union[int, list[int]]]]): emb_init_uniform_lim, ) == 2 and emb_init_uniform_lim[0] == emb_init_uniform_lim[1]: assert (model.emb.weight == emb_init_uniform_lim[0]).all() + + +@pytest.mark.parametrize( + 'padding_idx', + [0, 2], +) +def test_emb_padding_init(padding_idx: int,): + cfg: dict[str, Union[int, list[int]]] = { + 'vocab_size': 64, + 'in_features': 16, + 'n_layers': 2, + 'padding_idx': padding_idx, + 'emb_init_std': 5, + } + dict_cfg = om.create(cfg) + + model = nn.Embedding( + dict_cfg.vocab_size, + dict_cfg.in_features, + dict_cfg.padding_idx, + ) + + model.apply(partial(param_init_fns.get('kaiming_normal_'), **dict_cfg)) + assert isinstance(model, torch.nn.Embedding) + + if dict_cfg.get('emb_init_std') is not None: + assert (model.weight[padding_idx] == 0).all() From 9a1b78b128a242590b00f364a99d2d2d735f9468 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 17 Sep 2024 10:29:09 -0700 Subject: [PATCH 06/17] Revert "Replace FSDP args" (#1533) --- llmfoundry/command_utils/eval.py | 8 ++------ tests/a_scripts/inference/test_convert_composer_to_hf.py | 5 ++--- tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/test_fsdp_act_checkpoint.py | 2 +- 5 files changed, 7 insertions(+), 12 deletions(-) diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index eca16bd815..f622ca182d 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -52,7 +52,7 @@ def evaluate_model( device_eval_batch_size: Union[int, float], eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], - parallelism_config: Optional[dict[str, Any]], + fsdp_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: 
Optional[str], precision: str, @@ -99,10 +99,6 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) - fsdp_config = parallelism_config.get( - 'fsdp_config', - None, - ) if parallelism_config else None if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + @@ -320,7 +316,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: device_eval_batch_size=eval_config.device_eval_batch_size, eval_gauntlet_config=eval_gauntlet_config, eval_loader_config=eval_loader_config, - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, loggers=loggers, python_log_level=eval_config.python_log_level, precision=eval_config.precision, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 66ec739a65..4f1bd63c62 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1042,8 +1042,7 @@ def test_huggingface_conversion_callback( model=original_model, device='gpu', precision=trainer_precision, - parallelism_config={'fsdp': fsdp_config} - if fsdp_state_dict_type is not None else None, + fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, @@ -1470,7 +1469,7 @@ def test_mptmoe_huggingface_conversion_callback( trainer = Trainer( model=original_model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 8e6c113169..69ced673a1 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -91,7 +91,7 @@ def test_fsdp_weight_tying( trainer = Trainer( model=original_model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 01acc22a60..56cb36c8c1 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -125,7 +125,7 @@ def test_lora_mixed_init( trainer = Trainer( model=original_model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index 366bcf7786..a41574538a 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -59,7 +59,7 @@ def test_fsdp_act_checkpoint( trainer = Trainer( model=model, device='gpu', - parallelism_config={'fsdp': fsdp_config}, + fsdp_config=fsdp_config, ) assert trainer.state.fsdp_enabled From 7a23f60ad5ce25e80c3d5f3ab3badfb413743daa Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Tue, 17 Sep 2024 12:54:28 -0700 Subject: [PATCH 07/17] Delete unneeded inner base model in PEFT HF Checkpointer (#1532) --- llmfoundry/callbacks/hf_checkpointer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 
4e6a501f2f..65bdcb3b6c 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -585,6 +585,7 @@ def tensor_hook( new_base_model_instance, original_model.peft_config[active_adapter], ) + del new_base_model_instance else: new_model_instance = type(original_model)(new_config) new_model_instance.generation_config.update( From 2e3d14f6130ebad5a149c1c52f53fd07628e1006 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 17 Sep 2024 13:45:04 -0700 Subject: [PATCH 08/17] Add deprecation warning to fsdp_config (#1530) Co-authored-by: v-chen_data --- llmfoundry/command_utils/eval.py | 35 ++++- .../inference/test_convert_composer_to_hf.py | 5 +- tests/eval/test_eval_deprecation.py | 125 ++++++++++++++++++ tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/test_fsdp_act_checkpoint.py | 2 +- 6 files changed, 163 insertions(+), 8 deletions(-) create mode 100644 tests/eval/test_eval_deprecation.py diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index f622ca182d..e644ad1f0f 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -4,6 +4,7 @@ import logging import os import time +import warnings from typing import Any, Optional, Union import pandas as pd @@ -11,7 +12,7 @@ from composer.core import Callback from composer.loggers.logger_destination import LoggerDestination from composer.trainer import Trainer -from composer.utils import dist, get_device, reproducibility +from composer.utils import dist, get_device, parallelism, reproducibility from omegaconf import DictConfig from omegaconf import OmegaConf as om @@ -36,6 +37,7 @@ process_init_device, ) from llmfoundry.utils.registry_utils import import_file +from llmfoundry.utils.warnings import VersionedDeprecationWarning log = logging.getLogger(__name__) @@ -52,7 +54,6 @@ def evaluate_model( device_eval_batch_size: Union[int, float], eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], - fsdp_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: Optional[str], precision: str, @@ -62,9 +63,33 @@ def evaluate_model( callback_configs: Optional[dict[str, Any]], metadata: Optional[dict[str, str]], logged_config: dict[str, Any], + fsdp_config: Optional[dict[str, Any]] = None, + parallelism_config: Optional[dict[str, Any]] = None, should_log_config: bool = True, load_path: Optional[str] = None, ): + if parallelism_config: + deprecated_fsdp_args = list( + parallelism.FSDPConfig.__annotations__.keys(), + ) + for deprecated_arg in deprecated_fsdp_args: + if deprecated_arg in parallelism_config: + raise ValueError( + 'parallelism_config cannot contain deprecated fsdp_config arguments.', + ) + + if fsdp_config: + warnings.warn( + VersionedDeprecationWarning( + 'The argument fsdp_config is deprecated. Please use parallelism_config instead.', + remove_version='0.13.0', + ), + ) + if fsdp_config and parallelism_config: + raise ValueError( + 'Both fsdp_config and parallelism_config cannot be provided at the same time. 
Please use parallelism_config.', + ) + log.info(f'Evaluating model: {model_name}') # Build tokenizer and model tokenizer_cfg = tokenizer @@ -99,6 +124,10 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) + fsdp_config = parallelism_config.get( + 'fsdp_config', + None, + ) if parallelism_config else fsdp_config if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + @@ -146,7 +175,7 @@ def evaluate_model( callbacks=callbacks, loggers=loggers, precision=precision, - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, load_path=load_path, load_weights_only=True, progress_bar=False, diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 4f1bd63c62..66ec739a65 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1042,7 +1042,8 @@ def test_huggingface_conversion_callback( model=original_model, device='gpu', precision=trainer_precision, - fsdp_config=fsdp_config if fsdp_state_dict_type is not None else None, + parallelism_config={'fsdp': fsdp_config} + if fsdp_state_dict_type is not None else None, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, @@ -1469,7 +1470,7 @@ def test_mptmoe_huggingface_conversion_callback( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=train_dataloader, save_folder=os.path.join(tmp_path, 'checkpoints'), save_interval=save_interval, diff --git a/tests/eval/test_eval_deprecation.py b/tests/eval/test_eval_deprecation.py new file mode 100644 index 0000000000..828186245a --- /dev/null +++ b/tests/eval/test_eval_deprecation.py @@ -0,0 +1,125 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import unittest +import warnings + +from llmfoundry.command_utils.eval import evaluate_model +from llmfoundry.utils.warnings import VersionedDeprecationWarning + + +class TestEvaluateModelDeprecation(unittest.TestCase): + + def setUp(self): + self.common_args = { # type: ignore + 'tokenizer': { + 'name': 'test_tokenizer', + }, + 'model': { + 'name': 'test_model', + }, + 'model_name': 'test', + 'dist_timeout': 60, + 'run_name': 'test_run', + 'seed': 42, + 'icl_tasks': [], + 'max_seq_len': 512, + 'device_eval_batch_size': 1, + 'eval_gauntlet_config': None, + 'eval_loader_config': None, + 'loggers': [], + 'python_log_level': None, + 'precision': 'fp32', + 'eval_gauntlet_df': None, + 'eval_subset_num_batches': 1, + 'icl_subset_num_batches': None, + 'callback_configs': None, + 'metadata': None, + 'logged_config': {}, + } + + def test_no_deprecation_warning(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + import composer.utils.parallelism + deprecated_fsdp_args = list( + composer.utils.parallelism.FSDPConfig.__annotations__.keys(), + ) + print(deprecated_fsdp_args) + + try: + parallelism_config = {'fsdp': {'verbose': True}} + evaluate_model( + **self.common_args, + parallelism_config=parallelism_config, + ) + except ValueError as ve: + if 'parallelism_config cannot contain deprecated fsdp_config arguments.' 
in str( + ve, + ): + self.fail( + 'Raised ValueError about deprecated fsdp_config arguments', + ) + elif 'Both fsdp_config and parallelism_config cannot be provided at the same time.' in str( + ve, + ): + self.fail( + 'Raised ValueError about both configs being provided', + ) + except Exception: + pass + + deprecation_warnings = [ + warning for warning in w + if isinstance(warning.message, VersionedDeprecationWarning) + ] + if deprecation_warnings: + self.fail('VersionedDeprecationWarning was raised') + + def test_deprecation_warning_with_deprecated_arg(self): + # Use assertRaises to catch the expected ValueError + with self.assertRaises(ValueError) as context: + # Directly call evaluate_model; do not use try-except here + evaluate_model( + **self.common_args, + parallelism_config={'activation_checkpointing': True}, + ) + + # Assert that the correct error message is in the exception + self.assertIn( + 'parallelism_config cannot contain deprecated fsdp_config arguments.', + str(context.exception), + ) + + def test_deprecation_warning_with_fsdp_config(self): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + + try: + evaluate_model( + **self.common_args, + parallelism_config=None, + fsdp_config={'verbose': True}, + ) + except Exception: + pass + + self.assertTrue( + any( + issubclass(warning.category, VersionedDeprecationWarning) + for warning in w + ), + ) + + def test_error_with_both_fsdp_and_parallelism_config(self): + with self.assertRaises(ValueError) as context: + evaluate_model( + **self.common_args, + parallelism_config={'some_arg': True}, + fsdp_config={'some_arg': True}, + ) + + self.assertIn( + 'Both fsdp_config and parallelism_config cannot be provided at the same time.', + str(context.exception), + ) diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 69ced673a1..8e6c113169 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -91,7 +91,7 @@ def test_fsdp_weight_tying( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 56cb36c8c1..01acc22a60 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -125,7 +125,7 @@ def test_lora_mixed_init( trainer = Trainer( model=original_model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, train_dataloader=[], device_train_microbatch_size=1, ) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index a41574538a..366bcf7786 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -59,7 +59,7 @@ def test_fsdp_act_checkpoint( trainer = Trainer( model=model, device='gpu', - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, ) assert trainer.state.fsdp_enabled From d7c78229e91129d4c35006209fabd5fb2f2252e9 Mon Sep 17 00:00:00 2001 From: Shashank Rajput <144760128+ShashankMosaicML@users.noreply.github.com> Date: Sun, 22 Sep 2024 14:03:42 -0400 Subject: [PATCH 09/17] Fix reuse kv cache for torch attention (#1539) --- llmfoundry/models/layers/attention.py | 3 +++ tests/models/layers/test_flash_torch.py | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git 
a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index a1af2235cf..625327767e 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -656,6 +656,9 @@ def get_qkv( 'prev_layer_key_value is None, cannot reuse_prev_layer_kv.', ) key, value = prev_layer_key_value + if self.attn_impl == 'torch': + key = rearrange(key, 'b h d s -> b s (h d)') + value = rearrange(value, 'b h s d -> b s (h d)') query = self.Wq(x) if self.clip_qkv: diff --git a/tests/models/layers/test_flash_torch.py b/tests/models/layers/test_flash_torch.py index 01a6a7576d..0a4b32a73a 100644 --- a/tests/models/layers/test_flash_torch.py +++ b/tests/models/layers/test_flash_torch.py @@ -188,7 +188,7 @@ def gen_bias(attn_impl: str): alibi=alibi, alibi_bias_max=8, ) - if attn_impl != 'flash' and attn_uses_sequence_id and sequence_id is not None: + if attn_impl == 'torch' and attn_uses_sequence_id and sequence_id is not None: assert isinstance(attn_bias, torch.Tensor) # pyright attn_bias = apply_sequence_id( attn_bias, @@ -561,8 +561,10 @@ def test_grouped_query_invalid_heads(): }, }], ) +@pytest.mark.parametrize('attn_impl', ['flash', 'torch']) def test_reuse_prev_layer_kv_cache( pos_emb_config: dict, + attn_impl: str, device: str = 'cuda', ): """Checks reusing previous layer's kv cache.""" @@ -570,7 +572,7 @@ def test_reuse_prev_layer_kv_cache( rope = pos_emb_config['rope'] cfg = { - 'attn_impl': 'flash', + 'attn_impl': attn_impl, 'd_model': 64, 'n_heads': 4, 'attn_pdrop': 0, @@ -630,6 +632,13 @@ def gen_bias(attn_impl: str): alibi=alibi, alibi_bias_max=8, ) + if attn_impl == 'torch': + assert isinstance(attn_bias, torch.Tensor) # pyright + attn_bias = apply_sequence_id( + attn_bias, + sequence_id, # type: ignore + s, + ) return attn_bias @@ -637,7 +646,7 @@ def gen_bias(attn_impl: str): sequence_id=sequence_id, S=s, attn_uses_sequence_id=True, - attn_impl='flash', + attn_impl=attn_impl, attention_mask=attention_mask, ) @@ -656,7 +665,7 @@ def gen_bias(attn_impl: str): x1.requires_grad = True with torch.autocast(x0.device.type): - attn_bias_0 = gen_bias('flash') + attn_bias_0 = gen_bias(attn_impl) alibi_slopes_0 = None if alibi: alibi_slopes_0 = gen_slopes( @@ -703,7 +712,7 @@ def gen_bias(attn_impl: str): flash_attn_padding_info=flash_attn_padding_info, alibi_slopes=alibi_slopes_0, ) - attn_bias_1 = gen_bias('flash') + attn_bias_1 = gen_bias(attn_impl) alibi_slopes_1 = None if alibi: alibi_slopes_1 = gen_slopes( From 14cff668750dc08eb4511ddee0d55b127e711dea Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 22 Sep 2024 19:49:21 -0400 Subject: [PATCH 10/17] Error on text dataset file not found (#1534) --- .../data_prep/convert_text_to_mds.py | 15 ++++++++++----- llmfoundry/utils/exceptions.py | 11 +++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 9a1f8a912d..3ea5aeb5d4 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -32,6 +32,7 @@ CannotUnicodeDecodeFile, DatasetTooSmallError, InputFolderMissingDataError, + InputFolderNotFound, OutputFolderNotEmptyError, ) @@ -125,11 +126,15 @@ def get_object_names(input_folder: str) -> list[str]: object_store = maybe_create_object_store_from_uri(input_folder) if object_store is not None: _, _, folder_prefix = parse_uri(input_folder) - names = [ - name for name in 
object_store.list_objects(folder_prefix) - if name.endswith('.txt') - ] - log.info(f'Found {len(names)} text files in remote storage') + try: + names = [ + name for name in object_store.list_objects(folder_prefix) + if name.endswith('.txt') + ] + log.info(f'Found {len(names)} text files in remote storage') + except FileNotFoundError: + raise InputFolderNotFound(folder_prefix) + else: # input_folder is a local folder names = [ diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 11895564f2..900355dff5 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -348,6 +348,17 @@ def __init__(self, input_folder: str) -> None: super().__init__(message, input_folder=input_folder) +class InputFolderNotFound(UserError): + """Error thrown when the a folder is not found.""" + + def __init__(self, folder_that_was_not_found: str) -> None: + message = f'{folder_that_was_not_found} not found.' + super().__init__( + message, + folder_that_was_not_found=folder_that_was_not_found, + ) + + class CannotUnicodeDecodeFile(UserError): """Error thrown when the input folder is missing data.""" From a2c0507795a887b6fb71d3ef975b714523fe2abb Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Sun, 22 Sep 2024 18:23:51 -0700 Subject: [PATCH 11/17] Make ICL tasks not required for eval (#1540) --- llmfoundry/command_utils/eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index e644ad1f0f..70c4319ea8 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -262,7 +262,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: EvalConfig, EVAL_CONFIG_KEYS, transforms=[allow_toplevel_keys], - icl_tasks_required=True, + icl_tasks_required=False, ) model_configs = eval_config.models @@ -273,7 +273,7 @@ def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: # Mandatory Evaluation Parameters icl_tasks = eval_config.icl_tasks or eval_config.icl_tasks_str if icl_tasks is None: - raise ValueError('icl_tasks must be specified in the config') + icl_tasks = [] # Optional Evaluation Parameters with default values eval_loader_config = eval_config.eval_loader or eval_config.eval_loaders From 85403c086710bc0f62d03fc03c0fcbb2e5ffda1d Mon Sep 17 00:00:00 2001 From: Shashank Rajput <144760128+ShashankMosaicML@users.noreply.github.com> Date: Mon, 23 Sep 2024 10:37:26 -0400 Subject: [PATCH 12/17] Bumping flash attention version to 2.6.3 and adding option for softcap in attention and lm_head logits. 
(#1374) --- llmfoundry/models/layers/attention.py | 24 +++++- llmfoundry/models/mpt/configuration_mpt.py | 14 +++ llmfoundry/models/mpt/modeling_mpt.py | 6 ++ llmfoundry/models/utils/config_defaults.py | 1 + setup.py | 2 +- tests/models/layers/test_flash_attn.py | 99 +++++++++++++++++++++- 6 files changed, 140 insertions(+), 6 deletions(-) diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index 625327767e..612d6b9642 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -112,6 +112,7 @@ def scaled_multihead_dot_product_attention( dropout_p: float = 0.0, training: bool = False, needs_weights: bool = False, + attn_logit_softcapping: Optional[float] = None, sliding_window_size: int = -1, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: @@ -149,6 +150,11 @@ def scaled_multihead_dot_product_attention( attn_weight = q.matmul(k) * softmax_scale + if attn_logit_softcapping is not None: + attn_weight = attn_logit_softcapping * torch.tanh( + attn_weight / attn_logit_softcapping, + ) + if attn_bias is not None: # clamp to 0 necessary for torch 2.0 compile() _s_q = max(0, attn_bias.size(2) - s_q) @@ -264,6 +270,7 @@ def flash_attn_fn( sliding_window_size: int = -1, alibi_slopes: Optional[torch.Tensor] = None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]] = None, + attn_logit_softcapping: Optional[float] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: if key_padding_mask is not None: @@ -381,13 +388,17 @@ def flash_attn_fn( return_attn_probs=needs_weights, ) elif is_flash_v2_installed(): - alibi_kwargs = {} + extra_attn_kwargs = {} if check_alibi_support('flash'): - alibi_kwargs = {'alibi_slopes': alibi_slopes} + extra_attn_kwargs['alibi_slopes'] = alibi_slopes elif alibi_slopes is not None: raise ValueError( 'alibi_slopes is only supported for flash-attn>=2.4.2', ) + if is_flash_v2_installed( + v2_version='v2.6.2', + ) and attn_logit_softcapping is not None: + extra_attn_kwargs['softcap'] = attn_logit_softcapping output_unpad = flash_attn_interface.flash_attn_varlen_func( q=query_unpad, k=key_unpad, @@ -401,7 +412,7 @@ def flash_attn_fn( causal=reset_is_causal, return_attn_probs=needs_weights, window_size=(sliding_window_size, sliding_window_size), - **alibi_kwargs, + **extra_attn_kwargs, ) else: raise RuntimeError( @@ -448,6 +459,7 @@ def __init__( bias: bool = True, sliding_window_size: int = -1, reuse_kv_layer_idx: Optional[int] = None, + attn_logit_softcapping: Optional[float] = None, kv_dim: Optional[int] = None, ): super().__init__() @@ -463,6 +475,7 @@ def __init__( self.kv_n_heads = kv_n_heads self.sliding_window_size = sliding_window_size self.reuse_kv_layer_idx = reuse_kv_layer_idx + self.attn_logit_softcapping = attn_logit_softcapping self.kv_dim = kv_dim if kv_dim is not None else self.d_model self.head_dim = d_model // n_heads @@ -625,6 +638,7 @@ def forward( dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, + attn_logit_softcapping=self.attn_logit_softcapping, sliding_window_size=self.sliding_window_size, **extra_attn_kwargs, ) @@ -853,6 +867,7 @@ def __init__( bias: bool = True, sliding_window_size: int = -1, reuse_kv_layer_idx: Optional[int] = None, + attn_logit_softcapping: Optional[float] = None, kv_dim: Optional[int] = None, ): super().__init__( @@ -873,6 +888,7 @@ def __init__( bias=bias, sliding_window_size=sliding_window_size, 
reuse_kv_layer_idx=reuse_kv_layer_idx, + attn_logit_softcapping=attn_logit_softcapping, kv_dim=kv_dim, ) @@ -902,6 +918,7 @@ def __init__( bias: bool = True, sliding_window_size: int = -1, reuse_kv_layer_idx: Optional[int] = None, + attn_logit_softcapping: Optional[float] = None, kv_dim: Optional[int] = None, ): super().__init__( @@ -922,6 +939,7 @@ def __init__( bias=bias, sliding_window_size=sliding_window_size, reuse_kv_layer_idx=reuse_kv_layer_idx, + attn_logit_softcapping=attn_logit_softcapping, kv_dim=kv_dim, ) diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 91b431e3b4..dbcabdf5f9 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -51,6 +51,7 @@ def __init__( tie_word_embeddings: bool = True, use_pad_tok_in_ffn: bool = True, block_overrides: Optional[dict[str, Any]] = None, + final_logit_softcapping: Optional[float] = None, **kwargs: Any, ): """The MPT configuration class. @@ -148,6 +149,7 @@ def __init__( reuse_kv_layer: attn_config: reuse_kv_layer_idx: -6 # Relative index of the layer whose kv cache to reuse + final_logit_softcapping (float | None): Softcapping threshold for final logit. Set to None to disable (default value None). Please see https://arxiv.org/pdf/2403.08295 for more details. kwargs (Any): Other relevant keyword arguments. """ self.d_model = d_model @@ -181,6 +183,7 @@ def __init__( if block_overrides is not None: self._validate_block_overrides(block_overrides) self.block_overrides = block_overrides + self.final_logit_softcapping = final_logit_softcapping if isinstance(fc_type, str): fc_type = {'name': fc_type} @@ -325,6 +328,17 @@ def _validate_config(self) -> None: raise NotImplementedError( 'sliding window attention only implemented for torch attention and flash attention (v2.3.0 or higher).', ) + if self.attn_config['attn_logit_softcapping'] is not None: + if self.attn_config['attn_logit_softcapping'] <= 0: + raise ValueError( + 'Attention attn_logit_softcapping should be positive.', + ) + if self.attn_config[ + 'attn_impl' + ] == 'flash' and not is_flash_v2_installed(v2_version='v2.6.2',): + raise NotImplementedError( + 'Attention attn_logit_softcapping is only implemented with torch attention or flash attention v2.6.2 (or higher).', + ) if self.attn_config['kv_dim'] is not None and self.attn_config[ 'fused_qkv']: raise ValueError( diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index cfe1172634..9212f5594d 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -1071,6 +1071,7 @@ def __init__(self, config: MPTConfig): f"{logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.", ) self.logit_scale = logit_scale + self.final_logit_softcapping = config.final_logit_softcapping @property def backbone_model_class(self) -> type[MPTModel]: @@ -1172,6 +1173,11 @@ def forward( ) logits *= self.logit_scale + if self.final_logit_softcapping is not None: + logits = self.final_logit_softcapping * torch.tanh( + logits / self.final_logit_softcapping, + ) + loss = None if labels is not None: _labels = torch.roll(labels, shifts=-1) diff --git a/llmfoundry/models/utils/config_defaults.py b/llmfoundry/models/utils/config_defaults.py index bd3b29a479..5550785149 100644 --- a/llmfoundry/models/utils/config_defaults.py +++ b/llmfoundry/models/utils/config_defaults.py @@ -18,6 +18,7 @@ 'softmax_scale': None, 'attn_uses_sequence_id': False, 
'sliding_window_size': -1, + 'attn_logit_softcapping': None, 'alibi': False, 'alibi_bias_max': 8, 'rope': False, diff --git a/setup.py b/setup.py index 0a75c610b8..ebc66fdacf 100644 --- a/setup.py +++ b/setup.py @@ -104,7 +104,7 @@ # Flash 2 group kept for backwards compatibility extra_deps['gpu-flash2'] = [ - 'flash-attn>=2.5.8,<3', + 'flash-attn>=2.6.3,<3', ] extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) diff --git a/tests/models/layers/test_flash_attn.py b/tests/models/layers/test_flash_attn.py index 987ea7160a..666d93c9b4 100644 --- a/tests/models/layers/test_flash_attn.py +++ b/tests/models/layers/test_flash_attn.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import math +from typing import Optional import pytest import torch @@ -334,5 +335,99 @@ def gen_bias(): _assert_approx_equal(value_1.grad, value_2.grad) -def _assert_approx_equal(value1: torch.Tensor, value2: torch.Tensor): - assert torch.norm(value2 - value1) <= 1e-2 + 1e-2 * torch.norm(value2) +@pytest.mark.gpu +@pytest.mark.skipif( + not is_flash_v2_installed(v2_version='v2.6.2'), + reason= + 'attn_logit_softcapping only supported by Flash Attention after v2.6.2.', +) +@pytest.mark.parametrize( + 'attn_logit_softcapping', + [None, 0.1, 1.0, 10.0, 100.0], +) +def test_attn_logit_softcapping(attn_logit_softcapping: Optional[float]): + # Test that attn_logit_softcapping in attention works as expected. + dtype = torch.bfloat16 + device = 'cuda' + d = 128 + seqlen_1 = 8 + bsz = 2 + n_heads = 4 + + query_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + query_1.requires_grad = True + key_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + key_1.requires_grad = True + value_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + value_1.requires_grad = True + output_1, _, _ = flash_attn_fn( + query=query_1, + key=key_1, + value=value_1, + n_heads=n_heads, + kv_n_heads=n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + attn_bias=None, + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + flash_attn_padding_info=gen_flash_attn_padding_info( + bsz, + seqlen_1, + 0, + query_1.device, + None, + None, + ), + should_repeat_kv_for_gqa=True, + attn_logit_softcapping=attn_logit_softcapping, + ) + output_1.sum().backward() + + query_2 = query_1.detach().clone() + query_2.requires_grad = True + key_2 = key_1.detach().clone() + key_2.requires_grad = True + value_2 = value_1.detach().clone() + value_2.requires_grad = True + output_2, _, _ = scaled_multihead_dot_product_attention( + query=query_2, + key=key_2, + value=value_2, + n_heads=n_heads, + kv_n_heads=n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + attn_logit_softcapping=attn_logit_softcapping, + ) + output_2.sum().backward() + + _assert_approx_equal(output_1, output_2) + assert (query_2.grad is not None) and (query_1.grad is not None) + _assert_approx_equal(query_1.grad, query_2.grad) + assert (key_2.grad is not None) and (key_1.grad is not None) + _assert_approx_equal(key_1.grad, key_2.grad) + assert (value_2.grad is not None) and (value_1.grad is not None) + _assert_approx_equal(value_1.grad, value_2.grad) + + +def _assert_approx_equal( + value1: torch.Tensor, + value2: torch.Tensor, + atol: float = 1e-2, + rtol: float = 1e-2, +): + actual_difference = torch.norm(value2 - value1) + allowed_difference = 
atol + rtol * torch.norm(value2) + assert actual_difference < allowed_difference, f'{actual_difference=}, {allowed_difference=}' From f377090dec102afc646fb29a4510ded6ae74ecf9 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Mon, 23 Sep 2024 16:00:07 -0700 Subject: [PATCH 13/17] Register mosaic logger (#1542) --- llmfoundry/loggers/__init__.py | 2 ++ tests/loggers/test_mosaic_ml_logger.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 tests/loggers/test_mosaic_ml_logger.py diff --git a/llmfoundry/loggers/__init__.py b/llmfoundry/loggers/__init__.py index cd3f3fdc62..c60d9be2cd 100644 --- a/llmfoundry/loggers/__init__.py +++ b/llmfoundry/loggers/__init__.py @@ -4,6 +4,7 @@ from composer.loggers import ( InMemoryLogger, MLFlowLogger, + MosaicMLLogger, TensorboardLogger, WandBLogger, ) @@ -18,3 +19,4 @@ func=InMemoryLogger, ) # for backwards compatibility loggers.register('mlflow', func=MLFlowLogger) +loggers.register('mosaicml', func=MosaicMLLogger) diff --git a/tests/loggers/test_mosaic_ml_logger.py b/tests/loggers/test_mosaic_ml_logger.py new file mode 100644 index 0000000000..e9c003321b --- /dev/null +++ b/tests/loggers/test_mosaic_ml_logger.py @@ -0,0 +1,16 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from composer.loggers import MosaicMLLogger + +from llmfoundry.utils.builders import build_logger + + +def test_mosaic_ml_logger_constructs(): + mosaic_ml_logger = build_logger( + 'mosaicml', + kwargs={'ignore_exceptions': True}, + ) + + assert isinstance(mosaic_ml_logger, MosaicMLLogger) + assert mosaic_ml_logger.ignore_exceptions == True From d85c83b15d5b07a1b8cd00eaa7e400aaf7b22ea7 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 23 Sep 2024 23:24:16 -0700 Subject: [PATCH 14/17] Hfcheckpointer optional generation config (#1543) Co-authored-by: v-chen_data --- llmfoundry/callbacks/hf_checkpointer.py | 7 ++- .../inference/test_convert_composer_to_hf.py | 56 ++++++++++++++++++- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 65bdcb3b6c..4365a5b2e5 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -588,9 +588,10 @@ def tensor_hook( del new_base_model_instance else: new_model_instance = type(original_model)(new_config) - new_model_instance.generation_config.update( - **original_model.generation_config.to_dict(), - ) + if new_model_instance.generation_config is not None: + new_model_instance.generation_config.update( + **original_model.generation_config.to_dict(), + ) # Then load the state dict in with "assign" so that the state dict # is loaded properly even though the model is initially on meta device. 
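# The guard above matters because `generation_config` on a Hugging Face model can be
# a GenerationConfig object, a plain dict attached to the config, or absent entirely
# (None for non-generative wrappers); the checkpointer now only copies it across when
# the freshly constructed model actually has one. The new test below parametrizes
# exactly those cases. A small normalization helper in the same spirit, purely
# illustrative rather than llm-foundry API:
from typing import Any, Optional, Union

from transformers import GenerationConfig


def normalize_generation_config(
    value: Optional[Union[dict[str, Any], GenerationConfig]],
) -> Optional[GenerationConfig]:
    if value is None or isinstance(value, GenerationConfig):
        return value
    # A bare dict (possibly empty) becomes a real GenerationConfig object.
    return GenerationConfig(**value)


# Example: normalize_generation_config({'max_length': 200}).max_length == 200,
# while normalize_generation_config(None) stays None.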
diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 66ec739a65..bf5f2a970b 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -8,13 +8,14 @@ import pathlib import shutil from argparse import Namespace -from typing import Any, Callable, Optional, cast +from typing import Any, Callable, Optional, Union, cast from unittest import mock from unittest.mock import ANY, MagicMock, patch import catalogue import pytest import torch +import torch.nn as nn import transformers from composer import ComposerModel, Trainer from composer.loggers import MLFlowLogger @@ -23,7 +24,13 @@ from omegaconf import OmegaConf as om from torch.distributed._tensor.api import DTensor from torch.utils.data import DataLoader -from transformers import PreTrainedModel, PreTrainedTokenizerBase +from transformers import ( + AutoConfig, + GenerationConfig, + PretrainedConfig, + PreTrainedModel, + PreTrainedTokenizerBase, +) from llmfoundry.callbacks import HuggingFaceCheckpointer from llmfoundry.callbacks.hf_checkpointer import _maybe_get_license_filename @@ -1637,3 +1644,48 @@ def test_license_file_finder( found_path = _maybe_get_license_filename(str(tmp_path)) assert (found_path == license_file_name ) if license_file_name is not None else (found_path is None) + + +@pytest.mark.parametrize('generation_config', [None, {}, {'max_length': 200}]) +def test_generation_config_variants( + generation_config: Optional[Union[dict[str, Any], GenerationConfig]], +): + + class MockModel(nn.Module): + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + # Ensure generation_config is always a GenerationConfig object + if isinstance(config.generation_config, dict): + self.generation_config = GenerationConfig( + **config.generation_config, + ) + else: + self.generation_config = config.generation_config + + config = AutoConfig.from_pretrained('gpt2') + # Convert dict to GenerationConfig if needed + if isinstance(generation_config, dict): + generation_config = GenerationConfig(**generation_config) + config.generation_config = generation_config + + mock_model = MockModel(config) + logger = MagicMock() + state = MagicMock() + state.timestamp.batch = 1 + state.is_model_ddp = False + state.model.model = mock_model + state.model.tokenizer = None + + checkpointer = HuggingFaceCheckpointer( + save_folder='test', + save_interval='1ba', + ) + + checkpointer._save_checkpoint( + state=state, + logger=logger, + upload_to_save_folder=False, + register_to_mlflow=False, + ) From 275a2a40d86a36882cc7963e2677628e05aaaf01 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 24 Sep 2024 16:57:21 -0700 Subject: [PATCH 15/17] Bump composer version to 0.25.0 (#1546) --- setup.py | 8 ++++---- tests/a_scripts/inference/test_convert_composer_to_hf.py | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index ebc66fdacf..48c1326b0d 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.24.1,<0.25', + 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.25.0,<0.26', 'mlflow>=2.14.1,<2.17', 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', @@ -91,7 +91,7 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.24.1,<0.25', + 'mosaicml[databricks]>=0.25.0,<0.26', 'numpy<2', 
'databricks-sql-connector>=3,<4', 'databricks-connect==14.1.0', @@ -99,7 +99,7 @@ ] extra_deps['tensorboard'] = [ - 'mosaicml[tensorboard]>=0.24.1,<0.25', + 'mosaicml[tensorboard]>=0.25.0,<0.26', ] # Flash 2 group kept for backwards compatibility @@ -110,7 +110,7 @@ extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) extra_deps['peft'] = [ - 'mosaicml[peft]>=0.24.1,<0.25', + 'mosaicml[peft]>=0.25.0,<0.26', ] extra_deps['openai'] = [ diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index bf5f2a970b..c25432dc48 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -1563,6 +1563,8 @@ def test_mptmoe_huggingface_conversion_callback( # Check output equivalence loaded_model = loaded_model.cuda().bfloat16() # type: ignore + for k, v in batch.items(): + batch[k] = v.cuda() loaded_model_logits = loaded_model( input_ids=batch.get('input_ids', None), attention_mask=batch.get('attention_mask', None), From 151a2e297b603d84e1e4dfed389c3494990936e6 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 25 Sep 2024 08:53:05 -0700 Subject: [PATCH 16/17] Bump streaming version to 0.9.0 (#1550) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 48c1326b0d..d1979faf63 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ 'mlflow>=2.14.1,<2.17', 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', - 'mosaicml-streaming>=0.8.1,<0.9', + 'mosaicml-streaming>=0.9.0,<0.10', 'torch>=2.4.0,<2.4.1', 'datasets>=2.19,<2.20', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data From 722526d420dab9adc5a5be18425d5e08c97ee0c8 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 25 Sep 2024 09:25:27 -0700 Subject: [PATCH 17/17] Bump version to 0.13.0.dev0 (#1549) --- llmfoundry/_version.py | 2 +- llmfoundry/command_utils/eval.py | 2 +- llmfoundry/models/hf/model_wrapper.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/_version.py b/llmfoundry/_version.py index 2f1f590b19..0cddcaf967 100644 --- a/llmfoundry/_version.py +++ b/llmfoundry/_version.py @@ -3,4 +3,4 @@ """The LLM Foundry Version.""" -__version__ = '0.12.0.dev0' +__version__ = '0.13.0.dev0' diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index 70c4319ea8..73127e8a07 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -82,7 +82,7 @@ def evaluate_model( warnings.warn( VersionedDeprecationWarning( 'The argument fsdp_config is deprecated. Please use parallelism_config instead.', - remove_version='0.13.0', + remove_version='0.14.0', ), ) if fsdp_config and parallelism_config: diff --git a/llmfoundry/models/hf/model_wrapper.py b/llmfoundry/models/hf/model_wrapper.py index c8805e5d6d..f2b67db1ec 100644 --- a/llmfoundry/models/hf/model_wrapper.py +++ b/llmfoundry/models/hf/model_wrapper.py @@ -48,7 +48,7 @@ def __init__( warnings.warn( VersionedDeprecationWarning( '`HuggingFaceModelWithFSDP` is deprecated. In the future please use `BaseHuggingFaceModel`.', - remove_version='0.13.0', + remove_version='0.14.0', ), ) super().__init__(
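# Bumping `__version__` to 0.13.0.dev0 goes hand in hand with the two hunks above:
# every pending VersionedDeprecationWarning has its `remove_version` pushed from
# 0.13.0 to 0.14.0, so nothing is slated for removal in the release that is just
# being opened. A hedged sketch of that warning pattern; the wrapped function is
# illustrative, and the import path is assumed to be llmfoundry.utils.warnings:
import warnings

from llmfoundry.utils.warnings import VersionedDeprecationWarning


def deprecated_entry_point() -> None:
    # Emit the versioned warning on use; callers should migrate before 0.14.0.
    warnings.warn(
        VersionedDeprecationWarning(
            'deprecated_entry_point is deprecated. Please use its replacement instead.',
            remove_version='0.14.0',
        ),
    )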