Do dtype conversion in torch hook to save memory (#1384)
* Do dtype conversion in torch hook to save memory

* update code comment

Co-authored-by: Saaketh Narayan <[email protected]>

---------

Co-authored-by: Saaketh Narayan <[email protected]>
irenedea and snarayan21 authored Jul 23, 2024
1 parent 0bed4ff commit 596dd9d
Showing 2 changed files with 14 additions and 14 deletions.
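
Summary of the change: the dtype conversion previously ran as a separate pass over the fully gathered state dict (the loop removed from hf_checkpointer.py below), so at peak the original tensors and their converted copies were held together. The conversion now happens inside the torch state-dict hook, downcasting each tensor as it is produced. Below is a minimal single-module sketch of the pattern, assuming bfloat16 as the target dtype; `downcast_hook` is an illustrative name, and `_register_state_dict_hook` is the same private PyTorch API the commit uses:

import torch
import torch.nn as nn

def downcast_hook(
    module: nn.Module,
    state_dict: dict,
    prefix: str,
    *args,
) -> dict:
    # Downcast each tensor as the state dict is built, so the original-
    # precision value is dropped immediately instead of surviving until a
    # whole-state-dict conversion pass at the end.
    for fqn, tensor in state_dict.items():
        if isinstance(tensor, torch.Tensor):
            state_dict[fqn] = tensor.to(dtype=torch.bfloat16)
    return state_dict

model = nn.Linear(4096, 4096)
handle = model._register_state_dict_hook(downcast_hook)  # private torch API
assert model.state_dict()['weight'].dtype == torch.bfloat16
handle.remove()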
26 changes: 12 additions & 14 deletions llmfoundry/callbacks/hf_checkpointer.py
@@ -435,8 +435,8 @@ def _save_checkpoint(self, state: State, logger: Logger):
 
             cpu_offload = True
 
-            # Add a dtensor->cpu tensor hook to avoid CUDA OOM
-            def dtensor_to_tensor_hook(
+            # Add hook to move tensors to cpu to avoid CUDA OOM
+            def tensor_hook(
                 module: nn.Module,
                 state_dict: Dict[str, Any],
                 prefix: str,
@@ -449,20 +449,23 @@ def dtensor_to_tensor_hook(
                         dtensor_fqns.append(fqn)
                         tensor = tensor.full_tensor()  # type: ignore
                         if dist.get_global_rank() == 0:
+                            # Offload any DTensors to CPU
                             if cpu_offload:
                                 tensor = tensor.cpu()
                             state_dict[fqn] = tensor
+                        else:
+                            state_dict[fqn] = None
+                    # Convert the state dict to the requested precision
+                    if isinstance(tensor, torch.Tensor):
+                        state_dict[fqn] = tensor.to(dtype=self.dtype)
+                    del tensor
                 if dist.get_global_rank() != 0:
-                    for fqn in dtensor_fqns:
-                        del state_dict[fqn]
+                    state_dict = {}
                 return state_dict
 
             hooks = []
             for _, module in state_dict_model.named_modules():
                 if isinstance(module, FSDP):
-                    hooks.append(
-                        module._register_state_dict_hook(dtensor_to_tensor_hook),
-                    )
+                    hooks.append(module._register_state_dict_hook(tensor_hook),)
 
             state_dict = get_model_state_dict(
                 state_dict_model,
@@ -474,11 +477,6 @@ def dtensor_to_tensor_hook(
             for hook in hooks:
                 hook.remove()
 
-            # Convert the state dict to the requested precision
-            for k, v in state_dict.items():
-                if isinstance(v, torch.Tensor):
-                    state_dict[k] = v.to(dtype=self.dtype)
-
             new_model_instance = None  # Need this for pyright because variable could be unbound
 
             if dist.get_global_rank() == 0:
@@ -537,7 +535,7 @@ def dtensor_to_tensor_hook(
                     original_tokenizer.save_pretrained(temp_save_dir)
 
                 # Only need to edit files for MPT because it has custom code
-                if original_model.config.model_type == 'mpt':
+                if new_model_instance.config.model_type == 'mpt':
                     log.debug('Editing MPT files for HuggingFace compatibility')
                     edit_files_for_hf_compatibility(
                         temp_save_dir,
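A note on the `state_dict = {}` branch in tensor_hook above: `nn.Module.state_dict` replaces its result with whatever a root-module hook returns, so non-zero ranks never hold a materialized copy of the gathered weights. A single-process sketch of that return-value behavior; `drop_all_hook` is an illustrative stand-in for the `dist.get_global_rank() != 0` guard in the commit:

import torch.nn as nn

def drop_all_hook(module, state_dict, prefix, *args):
    # Stand-in for a non-zero rank: return an empty dict so this process
    # keeps no copy of the weights it helped gather.
    return {}

model = nn.Linear(8, 8)
handle = model._register_state_dict_hook(drop_all_hook)  # private torch API
assert len(model.state_dict()) == 0
handle.remove()  # the commit likewise removes its hooks after use
assert 'weight' in model.state_dict()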
2 changes: 2 additions & 0 deletions tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -383,6 +383,8 @@ def test_huggingface_conversion_callback_interval(
     mlflow_logger_mock.model_registry_prefix = ''
     mlflow_logger_mock._experiment_id = 'mlflow-experiment-id'
     mlflow_logger_mock._run_id = 'mlflow-run-id'
+    mlflow_logger_mock._enabled = True
+    mlflow_logger_mock.run_url = 'fake-url'
     checkpointer_callback.transform_model_pre_registration = MagicMock(
         wraps=checkpointer_callback.transform_model_pre_registration,
     )
