From bdfefaf054693e8781678c3b242ffdf39e162583 Mon Sep 17 00:00:00 2001
From: kallewoof
Date: Fri, 5 Jan 2024 22:22:21 +0900
Subject: [PATCH] feature: better device mapping for large models (#918)

* fix: improved memory handling when model is bigger than existing VRAM

* feature: add lora_on_cpu flag to do LoRA loading on CPU (RAM)

For big models where the models are taking up the entire GPU VRAM, the
LoRA part will fail unless it is loaded on CPU only.

* doc: add README

* fix: enable progress bars in do_merge_lora()

* doc: mention gpu_memory_limit and lora_on_cpu in merge part of README

* Update src/axolotl/utils/models.py

Co-authored-by: Wing Lian

* fix: remove deletion of removed model_kwargs key

* fix: validate that gpu_memory_limit and max_memory are not both set

---------

Co-authored-by: Karl-Johan Alm
Co-authored-by: Wing Lian
---
 README.md                   |  9 +++++++-
 src/axolotl/cli/__init__.py |  3 ++-
 src/axolotl/utils/config.py |  5 +++++
 src/axolotl/utils/models.py | 41 +++++++++++++++++++++++++++++++++----
 4 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 2bf7483031..836b5af2cf 100644
--- a/README.md
+++ b/README.md
@@ -550,6 +550,11 @@ tf32: true # require >=ampere
 bfloat16: true # require >=ampere
 float16: true

+# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
+gpu_memory_limit: 20GiB
+# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
+lora_on_cpu: true
+
 # A list of one or more datasets to finetune the model with
 datasets:
   # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
@@ -1042,12 +1047,14 @@ The following command will merge your LORA adapater with your base model. You c
 python3 -m axolotl.cli.merge_lora your_config.yml --lora_model_dir="./completed-model"
 ```

-If you run out of CUDA memory, you can try to merge in system RAM with
+You may need to use the `gpu_memory_limit` and/or `lora_on_cpu` config options to avoid running out of memory. If you still run out of CUDA memory, you can try to merge in system RAM with

 ```bash
 CUDA_VISIBLE_DEVICES="" python3 -m axolotl.cli.merge_lora ...
 ```

+although this will be very slow, and using the config options above is recommended instead.
+
 ## Common Errors 🧰

 See also the [FAQ's](./docs/faq.md).
diff --git a/src/axolotl/cli/__init__.py b/src/axolotl/cli/__init__.py
index 0477ebebfb..4f441f527e 100644
--- a/src/axolotl/cli/__init__.py
+++ b/src/axolotl/cli/__init__.py
@@ -73,7 +73,7 @@ def do_merge_lora(
     safe_serialization = cfg.save_safetensors is True

     LOG.info("running merge of LoRA with base model")
-    model = model.merge_and_unload()
+    model = model.merge_and_unload(progressbar=True)
     model.to(dtype=cfg.torch_dtype)

     if cfg.local_rank == 0:
@@ -81,6 +81,7 @@
         model.save_pretrained(
             str(Path(cfg.output_dir) / "merged"),
             safe_serialization=safe_serialization,
+            progressbar=True,
         )
         tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))

diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py
index 9bade45728..4d4da18ba5 100644
--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -457,6 +457,11 @@ def validate_config(cfg):
             "lora_modules_to_save not properly set yet adding new tokens. Please add `embed_tokens` and `lm_head` to `lora_modules_to_save`."
         )

+    if cfg.max_memory is not None and cfg.gpu_memory_limit is not None:
+        raise ValueError(
+            "max_memory and gpu_memory_limit are mutually exclusive and cannot be used together."
+        )
+
     # TODO
     # MPT 7b
     # https://github.com/facebookresearch/bitsandbytes/issues/25
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index b30ffcad8c..6c579f1840 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -2,7 +2,7 @@
 import logging
 import math
 import os
-from typing import Optional, Tuple  # noqa: F401
+from typing import Any, Optional, Tuple  # noqa: F401

 import addict
 import bitsandbytes as bnb
@@ -288,8 +288,37 @@ def load_model(

     model_kwargs = {}

-    model_kwargs["device_map"] = cfg.device_map
-    model_kwargs["max_memory"] = cfg.max_memory
+    max_memory = cfg.max_memory
+    device_map = cfg.device_map
+
+    if cfg.gpu_memory_limit:
+        gpu_memory_limit = (
+            str(cfg.gpu_memory_limit) + "GiB"
+            if isinstance(cfg.gpu_memory_limit, int)
+            else cfg.gpu_memory_limit
+        )
+
+        max_memory = {}
+        for i in range(torch.cuda.device_count()):
+            max_memory[i] = gpu_memory_limit
+        max_memory["cpu"] = "256GiB"  # something sufficiently large to fit anything
+
+    if max_memory is not None:
+        # Based on https://github.com/togethercomputer/OpenChatKit/blob/main/inference/bot.py
+        from accelerate import infer_auto_device_map, init_empty_weights
+
+        with init_empty_weights():
+            model_canvas = AutoModelForCausalLM.from_config(model_config)
+        model_canvas.tie_weights()
+        device_map = infer_auto_device_map(
+            model_canvas,
+            max_memory=max_memory,
+            dtype=cfg.torch_dtype,
+        )
+        # We can discard max_memory now as we have a device map set up for us
+        max_memory = None
+
+    model_kwargs["device_map"] = device_map
     model_kwargs["torch_dtype"] = cfg.torch_dtype
     # TODO can we put the reference model on it's own gpu? I think we have to move logits around to calculate loss
     # if cfg.rl:
@@ -426,7 +455,6 @@
         model_kwargs["device"] = torch.cuda.current_device()
         del model_kwargs["torch_dtype"]
         del model_kwargs["device_map"]
-        del model_kwargs["max_memory"]

         model = MambaLMHeadModel.from_pretrained(
             base_model,
@@ -683,10 +711,15 @@ def load_lora(model, cfg, inference=False):

     if cfg.lora_model_dir:
         LOG.debug("Loading pretained PEFT - LoRA")
+        model_kwargs: Any = {}
+        if cfg.lora_on_cpu:
+            model_kwargs["max_memory"] = {"cpu": "256GiB"}
+            model_kwargs["device_map"] = {"": "cpu"}
         model = PeftModel.from_pretrained(
             model,
             cfg.lora_model_dir,
             is_trainable=(not inference),
+            **model_kwargs,
         )
     else:
         model = get_peft_model(model, lora_config)
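
For context beyond the patch itself, a minimal sketch of how the two new options could be used together in a merge config; only `gpu_memory_limit` and `lora_on_cpu` come from this patch, and every other key and value below is a placeholder assumption rather than part of the change:

```yaml
# Illustrative sketch only -- gpu_memory_limit and lora_on_cpu are the options
# added by this patch; the remaining keys/values are placeholder examples.
base_model: some-org/some-large-model   # placeholder: a base model too large to merge entirely in VRAM
adapter: lora
lora_model_dir: ./completed-model       # placeholder: directory holding the trained LoRA adapter
output_dir: ./completed-model

# Cap each GPU at this much memory; a bare integer is interpreted as GiB (20 -> "20GiB"),
# and any overflow is mapped to CPU RAM via accelerate's infer_auto_device_map().
gpu_memory_limit: 20GiB

# Load the PEFT/LoRA weights on CPU so they do not compete with the base model for VRAM.
lora_on_cpu: true
```

With a config along these lines, `python3 -m axolotl.cli.merge_lora your_config.yml` should be able to complete the merge without exhausting GPU memory, leaving the `CUDA_VISIBLE_DEVICES=""` fallback described in the README as a last resort.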