From a1eac155fe3d5cbedd32d1c11a5281dcf3c3540e Mon Sep 17 00:00:00 2001
From: notsyncing
Date: Fri, 2 Feb 2024 13:09:02 +0800
Subject: [PATCH] Check if the buffers fit GPU memory after device map auto inferred

* Some models, such as TheBloke/WizardCoder-33B-V1.1-GPTQ, contain a huge
  buffer, which may cause OOM on GPU memory if offload_buffers is not used.
  This commit adds a check for such cases.
---
 src/accelerate/big_modeling.py   |  6 ++-
 src/accelerate/utils/modeling.py | 68 +++++++++++++++++++++++++++++++-
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/src/accelerate/big_modeling.py b/src/accelerate/big_modeling.py
index ecab3070761..b1214c7780d 100644
--- a/src/accelerate/big_modeling.py
+++ b/src/accelerate/big_modeling.py
@@ -572,7 +572,11 @@ def load_checkpoint_and_dispatch(
             low_zero=(device_map == "balanced_low_0"),
         )
         device_map = infer_auto_device_map(
-            model, max_memory=max_memory, no_split_module_classes=no_split_module_classes, dtype=dtype
+            model,
+            max_memory=max_memory,
+            no_split_module_classes=no_split_module_classes,
+            dtype=dtype,
+            offload_buffers=offload_buffers,
         )
     if offload_state_dict is None and device_map is not None and "disk" in device_map.values():
         offload_state_dict = True
diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py
index 778577dbf54..1619b3c7eba 100644
--- a/src/accelerate/utils/modeling.py
+++ b/src/accelerate/utils/modeling.py
@@ -687,6 +687,7 @@ def compute_module_sizes(
     model: nn.Module,
     dtype: Optional[Union[str, torch.device]] = None,
     special_dtypes: Optional[Dict[str, Union[str, torch.device]]] = None,
+    buffers_only: bool = False,
 ):
     """
     Compute the size of each submodule of a given model.
@@ -698,7 +699,16 @@
         special_dtypes = {key: _get_proper_dtype(dtyp) for key, dtyp in special_dtypes.items()}
         special_dtypes_size = {key: dtype_byte_size(dtyp) for key, dtyp in special_dtypes.items()}
     module_sizes = defaultdict(int)
-    for name, tensor in named_module_tensors(model, recurse=True):
+
+    module_list = []
+
+    if not buffers_only:
+        module_list = named_module_tensors(model, recurse=True)
+    else:
+        if hasattr(model, "named_buffers"):
+            module_list = model.named_buffers(recurse=True)
+
+    for name, tensor in module_list:
         if special_dtypes is not None and name in special_dtypes:
             size = tensor.numel() * special_dtypes_size[name]
         elif dtype is None:
@@ -716,6 +726,18 @@
     return module_sizes
 
 
+def compute_module_total_buffer_size(
+    model: nn.Module,
+    dtype: Optional[Union[str, torch.device]] = None,
+    special_dtypes: Optional[Dict[str, Union[str, torch.device]]] = None,
+):
+    """
+    Compute the total size of buffers in each submodule of a given model.
+    """
+    module_sizes = compute_module_sizes(model, dtype, special_dtypes, True)
+    return module_sizes.get("", 0)
+
+
 def get_max_layer_size(
     modules: List[Tuple[str, torch.nn.Module]], module_sizes: Dict[str, int], no_split_module_classes: List[str]
 ):
@@ -1027,6 +1049,7 @@
     special_dtypes: Optional[Dict[str, Union[str, torch.dtype]]] = None,
     verbose: bool = False,
     clean_result: bool = True,
+    offload_buffers: bool = False,
 ):
     """
     Compute a device map for a given model giving priority to GPUs, then offload on CPU and finally offload to disk,
@@ -1063,6 +1086,9 @@
             Whether or not to provide debugging statements as the function builds the device_map.
         clean_result (`bool`, *optional*, defaults to `True`):
             Clean the resulting device_map by grouping all submodules that go on the same device together.
+        offload_buffers (`bool`, *optional*, defaults to `False`):
+            In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as
+            well as the parameters.
     """
     # Get default / clean up max_memory
     max_memory = get_max_memory(max_memory)
@@ -1095,6 +1121,8 @@
     device_map = OrderedDict()
     current_device = 0
     current_memory_used = 0
+    device_memory_used = {}
+    device_buffer_sizes = {}
 
     # Direct submodules and parameters
     modules_to_treat = (
@@ -1160,6 +1188,8 @@
                 # -> no split, we go to the next device
                 if verbose:
                     print("This module cannot be split, going to the next device.")
+
+                device_memory_used[device] = current_memory_used
                 current_device += 1
                 modules_to_treat = [(name, module)] + modules_to_treat
                 current_memory_used = 0
@@ -1211,6 +1241,15 @@
                        modules_to_treat.pop(tied_module_index)
                    device_map[tied_module_name] = devices[current_device]
 
+                if not offload_buffers:
+                    if device not in device_buffer_sizes.keys():
+                        device_buffer_sizes[device] = 0
+
+                    current_buffer_size = compute_module_total_buffer_size(
+                        module, dtype=dtype, special_dtypes=special_dtypes
+                    )
+                    device_buffer_sizes[device] = device_buffer_sizes[device] + current_buffer_size
+
             else:
                 # We don't fit with the tied modules. Next question is: can we split one of the tied modules to make it
                 # smaller or do we need to go on the next device?
@@ -1251,6 +1290,8 @@
                    # If the tied module is not split, we go to the next device
                    if verbose:
                        print("None of the tied module can be split, going to the next device.")
+
+                   device_memory_used[device] = current_memory_used
                    current_device += 1
                    modules_to_treat = [(name, module)] + modules_to_treat
                    current_memory_used = 0
@@ -1267,8 +1308,33 @@
            current_memory_used += module_size
            device_map[name] = devices[current_device]
 
+            if not offload_buffers:
+                if device not in device_buffer_sizes.keys():
+                    device_buffer_sizes[device] = 0
+
+                current_buffer_size = compute_module_total_buffer_size(
+                    module, dtype=dtype, special_dtypes=special_dtypes
+                )
+                device_buffer_sizes[device] = device_buffer_sizes[device] + current_buffer_size
+
     if clean_result:
         device_map = clean_device_map(device_map)
+
+    if not offload_buffers:
+        non_gpu_buffer_size = device_buffer_sizes.get("cpu", 0) + device_buffer_sizes.get("disk", 0)
+
+        for gpu_device, gpu_max_memory in max_memory.items():
+            if gpu_device == "cpu" or gpu_device == "disk":
+                continue
+
+            gpu_memory_used = device_memory_used.get(gpu_device, 0)
+            if gpu_max_memory < non_gpu_buffer_size + gpu_memory_used:
+                raise ValueError(
+                    f"Device {gpu_device} has {gpu_max_memory} bytes of memory, and has planned to use {gpu_memory_used} bytes, "
+                    f"so the remaining memory is not enough to store additional buffers ({non_gpu_buffer_size} bytes)! "
+                    f"Please consider using offload_buffers=True."
+                )
+
     return device_map
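
Illustrative usage (not part of the patch): the sketch below assumes the change above is applied and that CUDA device 0 is visible. The BlockWithBuffer module and the max_memory numbers are invented purely to force the new code path. With the default offload_buffers=False, the buffers of CPU- or disk-offloaded modules are still placed on the GPU at dispatch time, so infer_auto_device_map now checks that they fit and raises a ValueError suggesting offload_buffers=True when they do not.

import torch
from torch import nn

from accelerate import infer_auto_device_map


class BlockWithBuffer(nn.Module):
    """Toy block carrying a large non-parameter buffer (4096 * 4096 * 4 bytes = 64 MiB)."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(1024, 1024)
        self.register_buffer("scales", torch.zeros(4096, 4096))


model = nn.Sequential(*[BlockWithBuffer() for _ in range(4)])

# Artificially small GPU budget so every block gets offloaded to the CPU,
# while each 64 MiB buffer would still have to live on GPU 0 at dispatch time.
max_memory = {0: 40 * 1024**2, "cpu": 10 * 1024**3}

try:
    device_map = infer_auto_device_map(
        model, max_memory=max_memory, no_split_module_classes=["BlockWithBuffer"]
    )
except ValueError as err:
    print(err)  # the message suggests offload_buffers=True
    # Retry with offload_buffers=True so the buffers stay with their offloaded modules.
    device_map = infer_auto_device_map(
        model,
        max_memory=max_memory,
        no_split_module_classes=["BlockWithBuffer"],
        offload_buffers=True,
    )

Passing offload_buffers=True keeps the buffers with their offloaded modules, so the buffer accounting and the new check are skipped entirely.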