diff --git a/README.md b/README.md
index 2bf7483031..836b5af2cf 100644
--- a/README.md
+++ b/README.md
@@ -550,6 +550,11 @@ tf32: true # require >=ampere
 bfloat16: true # require >=ampere
 float16: true
 
+# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
+gpu_memory_limit: 20GiB
+# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
+lora_on_cpu: true
+
 # A list of one or more datasets to finetune the model with
 datasets:
   # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
@@ -1042,12 +1047,14 @@ The following command will merge your LORA adapater with your base model. You c
 python3 -m axolotl.cli.merge_lora your_config.yml --lora_model_dir="./completed-model"
 ```
 
-If you run out of CUDA memory, you can try to merge in system RAM with
+You may need to use the `gpu_memory_limit` and/or `lora_on_cpu` config options to avoid running out of memory. If you still run out of CUDA memory, you can try to merge in system RAM with
 
 ```bash
 CUDA_VISIBLE_DEVICES="" python3 -m axolotl.cli.merge_lora ...
 ```
 
+although this will be very slow, and using the config options above is recommended instead.
+
 ## Common Errors 🧰
 
 See also the [FAQ's](./docs/faq.md).
diff --git a/src/axolotl/cli/__init__.py b/src/axolotl/cli/__init__.py
index 0477ebebfb..4f441f527e 100644
--- a/src/axolotl/cli/__init__.py
+++ b/src/axolotl/cli/__init__.py
@@ -73,7 +73,7 @@ def do_merge_lora(
     safe_serialization = cfg.save_safetensors is True
 
     LOG.info("running merge of LoRA with base model")
-    model = model.merge_and_unload()
+    model = model.merge_and_unload(progressbar=True)
     model.to(dtype=cfg.torch_dtype)
 
     if cfg.local_rank == 0:
@@ -81,6 +81,7 @@
         model.save_pretrained(
             str(Path(cfg.output_dir) / "merged"),
             safe_serialization=safe_serialization,
+            progressbar=True,
         )
         tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
 
diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py
index 9bade45728..4d4da18ba5 100644
--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -457,6 +457,11 @@ def validate_config(cfg):
             "lora_modules_to_save not properly set yet adding new tokens. Please add `embed_tokens` and `lm_head` to `lora_modules_to_save`."
         )
 
+    if cfg.max_memory is not None and cfg.gpu_memory_limit is not None:
+        raise ValueError(
+            "max_memory and gpu_memory_limit are mutually exclusive and cannot be used together."
+        )
+
     # TODO
     # MPT 7b
     # https://github.com/facebookresearch/bitsandbytes/issues/25
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index b30ffcad8c..6c579f1840 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -2,7 +2,7 @@
 import logging
 import math
 import os
-from typing import Optional, Tuple  # noqa: F401
+from typing import Any, Optional, Tuple  # noqa: F401
 
 import addict
 import bitsandbytes as bnb
@@ -288,8 +288,37 @@ def load_model(
 
     model_kwargs = {}
 
-    model_kwargs["device_map"] = cfg.device_map
-    model_kwargs["max_memory"] = cfg.max_memory
+    max_memory = cfg.max_memory
+    device_map = cfg.device_map
+
+    if cfg.gpu_memory_limit:
+        gpu_memory_limit = (
+            str(cfg.gpu_memory_limit) + "GiB"
+            if isinstance(cfg.gpu_memory_limit, int)
+            else cfg.gpu_memory_limit
+        )
+
+        max_memory = {}
+        for i in range(torch.cuda.device_count()):
+            max_memory[i] = gpu_memory_limit
+        max_memory["cpu"] = "256GiB"  # something sufficiently large to fit anything
+
+    if max_memory is not None:
+        # Based on https://github.com/togethercomputer/OpenChatKit/blob/main/inference/bot.py
+        from accelerate import infer_auto_device_map, init_empty_weights
+
+        with init_empty_weights():
+            model_canvas = AutoModelForCausalLM.from_config(model_config)
+        model_canvas.tie_weights()
+        device_map = infer_auto_device_map(
+            model_canvas,
+            max_memory=max_memory,
+            dtype=cfg.torch_dtype,
+        )
+        # We can discard max_memory now as we have a device map set up for us
+        max_memory = None
+
+    model_kwargs["device_map"] = device_map
     model_kwargs["torch_dtype"] = cfg.torch_dtype
     # TODO can we put the reference model on it's own gpu? I think we have to move logits around to calculate loss
     # if cfg.rl:
@@ -426,7 +455,6 @@ def load_model(
             model_kwargs["device"] = torch.cuda.current_device()
             del model_kwargs["torch_dtype"]
             del model_kwargs["device_map"]
-            del model_kwargs["max_memory"]
 
             model = MambaLMHeadModel.from_pretrained(
                 base_model,
@@ -683,10 +711,15 @@ def load_lora(model, cfg, inference=False):
 
     if cfg.lora_model_dir:
         LOG.debug("Loading pretained PEFT - LoRA")
+        model_kwargs: Any = {}
+        if cfg.lora_on_cpu:
+            model_kwargs["max_memory"] = {"cpu": "256GiB"}
+            model_kwargs["device_map"] = {"": "cpu"}
         model = PeftModel.from_pretrained(
             model,
             cfg.lora_model_dir,
             is_trainable=(not inference),
+            **model_kwargs,
        )
     else:
         model = get_peft_model(model, lora_config)
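
For context on the `gpu_memory_limit` path added to `load_model()` above, here is a minimal standalone sketch of the same device-map inference with accelerate. The model name, dtype, and the 20GiB cap are illustrative assumptions, not values taken from this change.

```python
import torch
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

base_model = "NousResearch/Llama-2-7b-hf"  # hypothetical example model
gpu_memory_limit = "20GiB"                 # what `gpu_memory_limit: 20` normalizes to

# Cap every visible GPU at the limit and give the CPU a large ceiling so
# layers that do not fit on the GPUs can spill into system RAM.
max_memory = {i: gpu_memory_limit for i in range(torch.cuda.device_count())}
max_memory["cpu"] = "256GiB"

# Build the model skeleton on the meta device (no weights allocated), then
# let accelerate plan which module lands on which device.
model_config = AutoConfig.from_pretrained(base_model)
with init_empty_weights():
    model_canvas = AutoModelForCausalLM.from_config(model_config)
model_canvas.tie_weights()

device_map = infer_auto_device_map(
    model_canvas,
    max_memory=max_memory,
    dtype=torch.bfloat16,
)

# The real weights are then loaded with the precomputed plan, so max_memory
# itself no longer needs to be passed through.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map=device_map,
    torch_dtype=torch.bfloat16,
)
```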
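
Similarly, a sketch of what `lora_on_cpu: true` amounts to in `load_lora()` during a merge: the adapter is loaded with a CPU-only device map so it never competes for GPU VRAM. The base model name, adapter path, and output directory are placeholders.

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model_name = "NousResearch/Llama-2-7b-hf"  # placeholder base model
lora_model_dir = "./completed-model"            # placeholder LoRA adapter directory

# Keep the base model in system RAM for the merge.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

# Mirrors the kwargs load_lora() builds when cfg.lora_on_cpu is set:
# a CPU-only device map plus a generous CPU memory ceiling.
model = PeftModel.from_pretrained(
    base_model,
    lora_model_dir,
    is_trainable=False,
    max_memory={"cpu": "256GiB"},
    device_map={"": "cpu"},
)

merged = model.merge_and_unload(progressbar=True)
merged.save_pretrained("./merged", safe_serialization=True)
```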