diff --git a/README.md b/README.md
index 2bf7483031..836b5af2cf 100644
--- a/README.md
+++ b/README.md
@@ -550,6 +550,11 @@ tf32: true # require >=ampere
 bfloat16: true # require >=ampere
 float16: true
 
+# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
+gpu_memory_limit: 20GiB
+# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
+lora_on_cpu: true
+
 # A list of one or more datasets to finetune the model with
 datasets:
   # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
@@ -1042,12 +1047,14 @@ The following command will merge your LORA adapater with your base model. You c
 python3 -m axolotl.cli.merge_lora your_config.yml --lora_model_dir="./completed-model"
 ```
 
-If you run out of CUDA memory, you can try to merge in system RAM with
+You may need to use the `gpu_memory_limit` and/or `lora_on_cpu` config options to avoid running out of memory. If you still run out of CUDA memory, you can try to merge in system RAM with
 
 ```bash
 CUDA_VISIBLE_DEVICES="" python3 -m axolotl.cli.merge_lora ...
 ```
 
+although this will be very slow, and using the config options above is recommended instead.
+
 ## Common Errors 🧰
 
 See also the [FAQ's](./docs/faq.md).
diff --git a/src/axolotl/cli/__init__.py b/src/axolotl/cli/__init__.py
index 0477ebebfb..4f441f527e 100644
--- a/src/axolotl/cli/__init__.py
+++ b/src/axolotl/cli/__init__.py
@@ -73,7 +73,7 @@ def do_merge_lora(
     safe_serialization = cfg.save_safetensors is True
 
     LOG.info("running merge of LoRA with base model")
-    model = model.merge_and_unload()
+    model = model.merge_and_unload(progressbar=True)
     model.to(dtype=cfg.torch_dtype)
 
     if cfg.local_rank == 0:
@@ -81,6 +81,7 @@
         model.save_pretrained(
             str(Path(cfg.output_dir) / "merged"),
             safe_serialization=safe_serialization,
+            progressbar=True,
         )
         tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
 
diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py
index 9bade45728..4d4da18ba5 100644
--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -457,6 +457,11 @@ def validate_config(cfg):
             "lora_modules_to_save not properly set yet adding new tokens. Please add `embed_tokens` and `lm_head` to `lora_modules_to_save`."
         )
 
+    if cfg.max_memory is not None and cfg.gpu_memory_limit is not None:
+        raise ValueError(
+            "max_memory and gpu_memory_limit are mutually exclusive and cannot be used together."
+        )
+
     # TODO
     # MPT 7b
     # https://github.com/facebookresearch/bitsandbytes/issues/25
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index b30ffcad8c..6c579f1840 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -2,7 +2,7 @@
 import logging
 import math
 import os
-from typing import Optional, Tuple  # noqa: F401
+from typing import Any, Optional, Tuple  # noqa: F401
 
 import addict
 import bitsandbytes as bnb
@@ -288,8 +288,37 @@ def load_model(
 
     model_kwargs = {}
 
-    model_kwargs["device_map"] = cfg.device_map
-    model_kwargs["max_memory"] = cfg.max_memory
+    max_memory = cfg.max_memory
+    device_map = cfg.device_map
+
+    if cfg.gpu_memory_limit:
+        gpu_memory_limit = (
+            str(cfg.gpu_memory_limit) + "GiB"
+            if isinstance(cfg.gpu_memory_limit, int)
+            else cfg.gpu_memory_limit
+        )
+
+        max_memory = {}
+        for i in range(torch.cuda.device_count()):
+            max_memory[i] = gpu_memory_limit
+        max_memory["cpu"] = "256GiB"  # something sufficiently large to fit anything
+
+    if max_memory is not None:
+        # Based on https://github.com/togethercomputer/OpenChatKit/blob/main/inference/bot.py
+        from accelerate import infer_auto_device_map, init_empty_weights
+
+        with init_empty_weights():
+            model_canvas = AutoModelForCausalLM.from_config(model_config)
+        model_canvas.tie_weights()
+        device_map = infer_auto_device_map(
+            model_canvas,
+            max_memory=max_memory,
+            dtype=cfg.torch_dtype,
+        )
+        # We can discard max_memory now as we have a device map set up for us
+        max_memory = None
+
+    model_kwargs["device_map"] = device_map
     model_kwargs["torch_dtype"] = cfg.torch_dtype
     # TODO can we put the reference model on it's own gpu? I think we have to move logits around to calculate loss
     # if cfg.rl:
@@ -426,7 +455,6 @@ def load_model(
             model_kwargs["device"] = torch.cuda.current_device()
             del model_kwargs["torch_dtype"]
             del model_kwargs["device_map"]
-            del model_kwargs["max_memory"]
 
             model = MambaLMHeadModel.from_pretrained(
                 base_model,
@@ -683,10 +711,15 @@ def load_lora(model, cfg, inference=False):
 
     if cfg.lora_model_dir:
         LOG.debug("Loading pretained PEFT - LoRA")
+        model_kwargs: Any = {}
+        if cfg.lora_on_cpu:
+            model_kwargs["max_memory"] = {"cpu": "256GiB"}
+            model_kwargs["device_map"] = {"": "cpu"}
         model = PeftModel.from_pretrained(
             model,
             cfg.lora_model_dir,
             is_trainable=(not inference),
+            **model_kwargs,
        )
     else:
         model = get_peft_model(model, lora_config)
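
For context on the `gpu_memory_limit` path added to `load_model()` above, here is a minimal standalone sketch of the same device-map inference with accelerate. The model name, dtype, and the 20GiB cap are illustrative assumptions, not values taken from this change.

```python
import torch
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

base_model = "NousResearch/Llama-2-7b-hf"  # hypothetical example model
gpu_memory_limit = "20GiB"                 # what `gpu_memory_limit: 20` normalizes to

# Cap every visible GPU at the limit and give the CPU a large ceiling so
# layers that do not fit on the GPUs can spill into system RAM.
max_memory = {i: gpu_memory_limit for i in range(torch.cuda.device_count())}
max_memory["cpu"] = "256GiB"

# Build the model skeleton on the meta device (no weights allocated), then
# let accelerate plan which module lands on which device.
model_config = AutoConfig.from_pretrained(base_model)
with init_empty_weights():
    model_canvas = AutoModelForCausalLM.from_config(model_config)
model_canvas.tie_weights()

device_map = infer_auto_device_map(
    model_canvas,
    max_memory=max_memory,
    dtype=torch.bfloat16,
)

# The real weights are then loaded with the precomputed plan, so max_memory
# itself no longer needs to be passed through.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map=device_map,
    torch_dtype=torch.bfloat16,
)
```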
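
Similarly, a sketch of what `lora_on_cpu: true` amounts to in `load_lora()` during a merge: the adapter is loaded with a CPU-only device map so it never competes for GPU VRAM. The base model name, adapter path, and output directory are placeholders.

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model_name = "NousResearch/Llama-2-7b-hf"  # placeholder base model
lora_model_dir = "./completed-model"            # placeholder LoRA adapter directory

# Keep the base model in system RAM for the merge.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

# Mirrors the kwargs load_lora() builds when cfg.lora_on_cpu is set:
# a CPU-only device map plus a generous CPU memory ceiling.
model = PeftModel.from_pretrained(
    base_model,
    lora_model_dir,
    is_trainable=False,
    max_memory={"cpu": "256GiB"},
    device_map={"": "cpu"},
)

merged = model.merge_and_unload(progressbar=True)
merged.save_pretrained("./merged", safe_serialization=True)
```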