From c69faee7a74dee0595f7a1d1b73d1af661306646 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 23 Aug 2023 10:39:11 -0400 Subject: [PATCH 01/67] workaround so training doesn't hang when packed dataloader batches aren't even (#461) * workaround so training doesn't hang when packed dataloader batches aren't even * don't bother labeling anything in the no-op data --- src/axolotl/utils/dataloader.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/axolotl/utils/dataloader.py b/src/axolotl/utils/dataloader.py index dc3261d63e..6d5505baa2 100644 --- a/src/axolotl/utils/dataloader.py +++ b/src/axolotl/utils/dataloader.py @@ -243,6 +243,18 @@ def __iter__(self): len_remaining -= 1 if not len_remaining: return + # yield a no-op for cases where we don't have any data left to pack + for i in range(0, len_remaining): + yield self.collate_fn( + [ + { + "input_ids": [0], + "labels": [-100], + "attention_mask": [True], + "position_ids": [0], + } + ] + ) def _len_est(self): lengths_sum = np.sum(self.lengths) From 55c23c7bcbb3740dee900a8f6528dfce9b68adda Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 24 Aug 2023 00:56:01 +0900 Subject: [PATCH 02/67] Fix(doc): Clarify config (#466) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 20bde1786c..4ec79729f5 100644 --- a/README.md +++ b/README.md @@ -626,7 +626,7 @@ strict: Run ```bash -accelerate launch scripts/finetune.py configs/your_config.yml +accelerate launch scripts/finetune.py your_config.yml ``` #### Multi-GPU From bde3c5a478100fd205822a139ec1c9cade73c9c1 Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Wed, 23 Aug 2023 20:07:18 -0700 Subject: [PATCH 03/67] ReLoRA implementation (with quantization) (#322) * Experimental ReLoRA (+qlora) implementation * Add CPU offload * Remove local config * Fix saving logic * Remove redundant assert * Fix logic errors * Move ReLoRA into its own trainer class with a method override to create the proper scheduler * Formatting & typing fixes * Use safe_serialization * Don't allow fsdp/deepspeed with ReLoRA * Fix cpu-offload logic, enable multi gpu * Document parameters and add comment * Fix merge issue * Smooth over some sharp edges * Implement resume from checkpoint for relora * Address review comments * Fix saving logic * Add necessary metadata to safetensors --------- Co-authored-by: Wing Lian --- README.md | 6 + scripts/finetune.py | 37 +-- src/axolotl/monkeypatch/relora.py | 393 ++++++++++++++++++++++++++++++ src/axolotl/utils/callbacks.py | 4 +- src/axolotl/utils/config.py | 13 + src/axolotl/utils/trainer.py | 58 ++++- 6 files changed, 491 insertions(+), 20 deletions(-) create mode 100644 src/axolotl/monkeypatch/relora.py diff --git a/README.md b/README.md index 4ec79729f5..a81ac8b501 100644 --- a/README.md +++ b/README.md @@ -493,6 +493,12 @@ lora_modules_to_save: lora_out_dir: lora_fan_in_fan_out: false +# ReLoRA configuration +# must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed +relora_steps: # number of steps per ReLoRA restart +relora_warmup_steps: # number of per-restart warmup steps +relora_cpu_offload: # true to perform lora weight merges on cpu during restarts, for modest gpu memory savings + # wandb configuration if you're using it wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb wandb_project: # your wandb project name diff --git a/scripts/finetune.py b/scripts/finetune.py index 78eca05b9b..3255a623f2 100644 --- 
a/scripts/finetune.py +++ b/scripts/finetune.py @@ -242,6 +242,21 @@ def train( model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization) return + if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints: + possible_checkpoints = [ + str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*") + ] + if len(possible_checkpoints) > 0: + sorted_paths = sorted( + possible_checkpoints, + key=lambda path: int(path.split("-")[-1]), + ) + cfg.resume_from_checkpoint = sorted_paths[-1] + LOG.info( + f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}" + ) + resume_from_checkpoint = cfg.resume_from_checkpoint + trainer = setup_trainer( cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps ) @@ -273,20 +288,6 @@ def terminate_handler(_, __, model): LOG.info("Starting trainer...") if cfg.group_by_length: LOG.info("hang tight... sorting dataset for group_by_length") - resume_from_checkpoint = cfg.resume_from_checkpoint - if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints: - possible_checkpoints = [ - str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*") - ] - if len(possible_checkpoints) > 0: - sorted_paths = sorted( - possible_checkpoints, - key=lambda path: int(path.split("-")[-1]), - ) - resume_from_checkpoint = sorted_paths[-1] - LOG.info( - f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}" - ) if not Path(cfg.output_dir).is_dir(): os.makedirs(cfg.output_dir, exist_ok=True) @@ -301,6 +302,13 @@ def terminate_handler(_, __, model): LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}") + if cfg.relora_steps: + if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit): + model = model.merge_and_unload() + else: + # final model weights have already been saved by `ReLoRACallback.on_train_end` + return + # TODO do we need this fix? 
https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file if cfg.fsdp: @@ -308,6 +316,7 @@ def terminate_handler(_, __, model): elif cfg.local_rank == 0: if cfg.flash_optimum: model = BetterTransformer.reverse(model) + model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization) diff --git a/src/axolotl/monkeypatch/relora.py b/src/axolotl/monkeypatch/relora.py new file mode 100644 index 0000000000..e247fafd25 --- /dev/null +++ b/src/axolotl/monkeypatch/relora.py @@ -0,0 +1,393 @@ +"""Implements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.""" +import glob +import json +import logging +import os.path +import shutil +from pathlib import Path +from typing import Dict, List, Sequence + +import bitsandbytes as bnb +import peft +import safetensors.torch as st +import torch +from huggingface_hub import snapshot_download +from torch.optim.lr_scheduler import LRScheduler +from torch.optim.optimizer import Optimizer +from transformers import ( + TrainerCallback, + TrainerControl, + TrainerState, + TrainingArguments, +) +from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR + +from axolotl.utils.dict import DictDefault +from axolotl.utils.distributed import is_main_process + +LOG = logging.getLogger("axolotl.relora") + + +def reset_optimizer(optimizer: torch.optim.Optimizer): + for group in optimizer.param_groups: + for param in group["params"]: + param_state = optimizer.state[param] + for key in param_state: + if "qmap" in key: + continue + + if key == "step" and isinstance(param_state[key], int): + param_state[key] = 0 + else: + param_state[key] = torch.zeros_like(param_state[key]) + + +class ReLoRACallback(TrainerCallback): + """Callback to merge LoRA weights into the base model and save full-weight checkpoints""" + + def __init__(self, cfg: DictDefault): + self.relora_steps = cfg.relora_steps + self.cpu_offload = cfg.relora_cpu_offload + self.quantized = cfg.load_in_4bit or cfg.load_in_8bit + self.last_full_model = cfg.base_model + self.resume_from_checkpoint = cfg.resume_from_checkpoint + + if not os.path.exists(self.last_full_model): + self.last_full_model = str(Path(snapshot_download(cfg.base_model))) + + assert os.path.exists( + self.last_full_model + ), "for ReLORA base_model must be a local path" + + self.num_lora_restarts = 0 + self.need_full_save = False + + def on_train_begin( + self, + _args: TrainingArguments, + _state: TrainerState, + control: TrainerControl, + model: peft.LoraModel, + **_kwargs, + ): + if self.resume_from_checkpoint: + weight_path = os.path.join(self.resume_from_checkpoint, "relora") + if not os.path.exists(weight_path): + LOG.warning( + "Resuming ReLoRA from checkpoint, but no full-weight save found" + ) + else: + LOG.info(f"Loading adjusted base weights from {weight_path}") + load_weight_checkpoint(model, weight_path) + return control + + def on_step_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: peft.LoraModel, + optimizer: torch.optim.Optimizer, + **_kwargs, + ): + if state.global_step > 0 and state.global_step % self.relora_steps == 0: + checkpoint_folder = os.path.join( + args.output_dir, + f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", + "relora", + ) + + with torch.no_grad(): + merge_and_save( + model, + self.last_full_model, + checkpoint_folder, + reinit=True, + quantized=self.quantized, + 
actually_save=is_main_process(), + cpu_offload=self.cpu_offload, + ) + reset_optimizer(optimizer) + + if self.quantized: + self.last_full_model = checkpoint_folder + self.num_lora_restarts += 1 + + return control + + def on_save( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + model: peft.LoraModel, + **_kwargs, + ): + checkpoint_folder = os.path.join( + args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", "relora" + ) + if ( + state.global_step >= self.relora_steps + and state.global_step % self.relora_steps != 0 + ): + if self.quantized: + if self.last_full_model != checkpoint_folder: + # ensure the latest full parameter save is in the latest checkpoint + # folder, so that automatic pruning of checkpoints does not remove it + LOG.info(f"moving last full parameter save to {checkpoint_folder}") + os.makedirs(checkpoint_folder, exist_ok=True) + chunks = glob.glob( + f"{self.last_full_model}/model*.safetensors" + ) + glob.glob(f"{self.last_full_model}/model*.index.json") + for path in chunks: + new_path = os.path.abspath(shutil.move(path, checkpoint_folder)) + try: + os.symlink(new_path, path) + except OSError: + # probably on windows without permission to symlink + pass + + self.last_full_model = checkpoint_folder + else: + model.model.save_pretrained(checkpoint_folder, safe_serialization=True) + + return control + + def on_log( + self, + _args: TrainingArguments, + _state: TrainerState, + control: TrainerControl, + logs: Dict[str, float], + **_kwargs, + ): + logs["num_lora_restarts"] = self.num_lora_restarts + return control + + def on_train_end( + self, + args: TrainingArguments, + _state: TrainerState, + control: TrainerControl, + model: peft.LoraModel, + **_kwargs, + ): + if self.quantized: + # perform final merge and save + with torch.no_grad(): + merge_and_save( + model, + self.last_full_model, + args.output_dir, + reinit=False, + quantized=self.quantized, + actually_save=is_main_process(), + cpu_offload=self.cpu_offload, + ) + # no need to save if unquantized, as finetune.py will call merge_and_unload() + return control + + +class ReLoRAScheduler(LRScheduler): + """Wraps another scheduler to apply per-lora-restart learning rate warmups.""" + + def __init__( + self, + optimizer: Optimizer, + inner_schedule: LRScheduler, + relora_steps: int, + warmup_steps: int, + min_lr_scale: float = 0.001, + ) -> None: + self.inner_schedule = inner_schedule + self.relora_steps = relora_steps + self.warmup_steps = warmup_steps + self.min_lr_scale = min_lr_scale + super().__init__(optimizer, inner_schedule.last_epoch, inner_schedule.verbose) + + def get_lr(self) -> float: + self.inner_schedule.last_epoch = self.last_epoch + + original = self.inner_schedule.get_lr() + step = self.last_epoch + if step < self.relora_steps: + scale = 1 + else: + cycle_t = min(1.0, (step % self.relora_steps) / self.warmup_steps) + scale = cycle_t * (1 - self.min_lr_scale) + self.min_lr_scale + + if isinstance(original, Sequence): + return [lr * scale for lr in original] + return original * scale + + +def sharded_paths(path: str, module_names: List[str]) -> Dict[str, str]: + model_name = "model.safetensors" + if not os.path.exists(str(Path(path) / model_name)) and not os.path.exists( + str(Path(path) / f"{model_name}.index.json") + ): + model_name = "pytorch_model.bin" + + index_path = str(Path(path) / f"{model_name}.index.json") + if os.path.exists(index_path): + with open(index_path, "r", encoding="utf-8") as file: + data = json.load(file) + return data["weight_map"] + 
return {(module_name + ".weight"): model_name for module_name in module_names} + + +def lora_delta_weight(layer: peft.tuners.lora.LoraLayer, device) -> torch.Tensor: + if isinstance(layer, (peft.tuners.lora.Linear8bitLt, peft.tuners.lora.Linear4bit)): + adapter = layer.active_adapter + return ( + peft.utils.transpose( + layer.lora_B[adapter].weight.detach().to(device) + @ layer.lora_A[adapter].weight.detach().to(device), + getattr(layer, "fan_in_fan_out", False), + ) + * layer.scaling[adapter] + ) + + return layer.get_delta_weight().to(device) + + +def find_lora_modules(model: peft.LoraModel) -> Dict[str, peft.tuners.lora.LoraLayer]: + modules: Dict[str, peft.tuners.lora.LoraLayer] = {} + + key_list = [key for key, _ in model.model.named_modules() if "lora" not in key] + for key in key_list: + try: + # pylint: disable=protected-access + _parent, target, _target_name = peft.utils._get_submodules(model.model, key) + except AttributeError: + continue + + if isinstance(target, peft.tuners.lora.LoraLayer): + modules[key] = target + + return modules + + +def update_weights( + target: peft.tuners.lora.LoraLayer, new_weight: torch.Tensor, reinit: bool, device +): + if reinit: + for adapter_name in target.lora_A: + target.reset_lora_parameters(adapter_name) + for adapter_name in target.lora_embedding_A: + target.reset_lora_parameters(adapter_name) + + if isinstance(target, peft.tuners.lora.Linear4bit): + # This could be faster, but the quantization of Linear4bit weights occurs + # when the module is moved from cpu to gpu. Without meddling *too* deeply in + # PEFT's innards or maintaining a duplicate of that codepath, this is good + # enough for now. + target.weight.quant_state = None + target.weight.data = new_weight.cpu() + target.to(device) + elif isinstance(target, peft.tuners.lora.Linear8bitLt): + target.weight = bnb.nn.Int8Params(new_weight, requires_grad=False).to(device) + else: + target.weight.data = new_weight.to(device) + + +def merge_and_save( + model: peft.LoraModel, + model_src: str, + model_dst: str, + reinit: bool = False, + quantized: bool = False, + cpu_offload: bool = False, + actually_save: bool = True, +): + modules = find_lora_modules(model) + + if not quantized: + for module_name, target in modules.items(): + update = target.get_delta_weight(target.active_adapter).detach() + target.weight.data += update + + if reinit: + for adapter_name in target.lora_A: + target.reset_lora_parameters(adapter_name) + for adapter_name in target.lora_embedding_A: + target.reset_lora_parameters(adapter_name) + return + + os.makedirs(model_dst, exist_ok=True) + shard_paths = sharded_paths(model_src, modules.keys()) + out_shard_paths = {} + + unique_shards = list(set(shard_paths.values())) + for shard_path in unique_shards: + out_tensors = {} + if shard_path.endswith(".safetensors"): + in_tensors = st.load_file(str(Path(model_src) / shard_path)) + else: + in_tensors = torch.load(Path(model_src) / shard_path) + if "state_dict" in in_tensors: + in_tensors = in_tensors["state_dict"] + + for module_name, target in modules.items(): + key = module_name + ".weight" + if key not in shard_paths or shard_paths[key] != shard_path: + continue + + orig_weight = in_tensors[key] + old_dev = target.weight.device + math_dev = "cpu" if cpu_offload else old_dev + + delta_weight = lora_delta_weight(target, math_dev) + new_weight = orig_weight.to(math_dev) + delta_weight + del delta_weight + + if actually_save: + out_tensors[key] = new_weight.half().cpu() + + update_weights(target, new_weight, reinit=reinit, 
device=old_dev) + + if actually_save: + out_shard_name = shard_path + if out_shard_name.startswith("pytorch_model"): + out_shard_name = ( + out_shard_name.replace("pytorch_model", "model").rstrip(".bin") + + ".safetensors" + ) + + for module_name in in_tensors: + if module_name not in out_tensors: + out_tensors[module_name] = in_tensors[module_name].half() + out_shard_paths[module_name] = out_shard_name + + shard_fn = str(Path(model_dst) / out_shard_name) + LOG.info(f"saving tensors to {shard_fn}") + st.save_file(out_tensors, shard_fn, metadata={"format": "pt"}) + + del in_tensors + del out_tensors + torch.cuda.empty_cache() + + if actually_save and len(unique_shards) > 1: + with open( + str(Path(model_dst, "model.safetensors.index.json")), "w", encoding="utf-8" + ) as file: + json.dump({"metadata": {}, "weight_map": out_shard_paths}, file) + + +def load_weight_checkpoint(model: peft.LoraModel, checkpoint_path: str): + modules = find_lora_modules(model) + shard_paths = sharded_paths(checkpoint_path, modules.keys()) + unique_shards = list(set(shard_paths.values())) + + for shard_path in unique_shards: + tensors = st.load_file(os.path.join(checkpoint_path, shard_path)) + + for module_name, target in modules.items(): + key = module_name + ".weight" + if key not in shard_paths or shard_paths[key] != shard_path: + continue + + new_weight = tensors[key] + update_weights( + target, new_weight, reinit=False, device=target.weight.device + ) diff --git a/src/axolotl/utils/callbacks.py b/src/axolotl/utils/callbacks.py index 32a7f0c994..ddc179f390 100644 --- a/src/axolotl/utils/callbacks.py +++ b/src/axolotl/utils/callbacks.py @@ -33,7 +33,9 @@ def on_save( ) peft_model_path = os.path.join(checkpoint_folder, "adapter_model") - kwargs["model"].save_pretrained(peft_model_path) + kwargs["model"].save_pretrained( + peft_model_path, save_safetensors=args.save_safetensors + ) return control diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py index 3d2e029c36..abb3154d21 100644 --- a/src/axolotl/utils/config.py +++ b/src/axolotl/utils/config.py @@ -126,6 +126,19 @@ def validate_config(cfg): if not cfg.load_in_8bit and cfg.adapter == "lora": LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning") + if cfg.relora_steps: + if cfg.adapter not in ("lora", "qlora"): + raise ValueError("cfg.adapter must be lora or qlora to use ReLoRA") + + if cfg.fsdp: + raise ValueError("fsdp not supported with ReLoRA") + + if cfg.deepspeed: + raise ValueError("deepspeed not supported with ReLoRA") + + if cfg.lr_scheduler == "one_cycle": + raise ValueError("ReLoRA is not compatible with the one_cycle scheduler") + if cfg.trust_remote_code: LOG.warning( "`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model." 
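# A minimal, self-contained sketch of the restart cycle this ReLoRA patch wires up:
# train the LoRA adapter for relora_steps, merge the low-rank delta into the frozen
# base weight, re-initialize the adapter, reset the optimizer state, and warm the
# learning rate back up (mirroring ReLoRAScheduler above). Toy dimensions, names,
# and hyperparameters are assumed; this illustrates the idea, not the axolotl/peft
# code path.
import math
import torch

torch.manual_seed(0)
dim, rank, scaling, base_lr, min_lr_scale = 16, 4, 2.0, 1e-3, 0.001
relora_steps, warmup_steps = 50, 10

base = torch.nn.Linear(dim, dim, bias=False)
base.weight.requires_grad_(False)                      # frozen base weight
lora_a = torch.nn.Parameter(torch.empty(rank, dim))
lora_b = torch.nn.Parameter(torch.zeros(dim, rank))    # B starts at zero, so the delta starts at zero
torch.nn.init.kaiming_uniform_(lora_a, a=math.sqrt(5))
opt = torch.optim.AdamW([lora_a, lora_b], lr=base_lr)

def forward(x: torch.Tensor) -> torch.Tensor:
    # y = x W^T + scaling * x A^T B^T, i.e. an effective weight of W + scaling * B A
    return x @ base.weight.T + (x @ lora_a.T) @ lora_b.T * scaling

for step in range(1, 4 * relora_steps + 1):
    # per-restart warmup, same formula as ReLoRAScheduler.get_lr above
    if step < relora_steps:
        scale = 1.0
    else:
        cycle_t = min(1.0, (step % relora_steps) / warmup_steps)
        scale = cycle_t * (1 - min_lr_scale) + min_lr_scale
    opt.param_groups[0]["lr"] = base_lr * scale

    loss = forward(torch.randn(8, dim)).pow(2).mean()   # dummy objective
    loss.backward()
    opt.step()
    opt.zero_grad()

    if step % relora_steps == 0:
        with torch.no_grad():
            base.weight += (lora_b @ lora_a) * scaling              # merge delta into base
            torch.nn.init.kaiming_uniform_(lora_a, a=math.sqrt(5))  # re-init adapter
            lora_b.zero_()
        opt.state.clear()   # simplified stand-in for reset_optimizer above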
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index c9c17fe33c..c73b4a7135 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -24,6 +24,7 @@ get_parameter_names, ) +from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler from axolotl.utils.callbacks import ( GPUStatsCallback, SaveBetterTransformerModelCallback, @@ -127,6 +128,14 @@ class AxolotlTrainingArguments(TrainingArguments): default=1, metadata={"help": "the multiplier for the max len for packed sequences"}, ) + relora_steps: Optional[int] = field( + default=None, + metadata={"help": "how often to reset for ReLoRA"}, + ) + relora_warmup_steps: Optional[int] = field( + default=None, + metadata={"help": "how many warmup steps to take after reset for ReLoRA"}, + ) class AxolotlTrainer(Trainer): @@ -265,6 +274,39 @@ def create_scheduler( return self.lr_scheduler +class ReLoRATrainer(AxolotlTrainer): + """ + Trainer subclass that uses the ReLoRA per-restart warmup scheduler + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.lr_scheduler = None + + def create_scheduler( + self, + num_training_steps: int, + optimizer: Optional[torch.optim.Optimizer] = None, + ): + optimizer = self.optimizer if optimizer is None else optimizer + lr_scheduler = super().create_scheduler(num_training_steps, optimizer) + + if self.args.relora_steps: + warmup_steps = ( + self.args.relora_warmup_steps if self.args.relora_warmup_steps else 10 + ) + self.lr_scheduler = ReLoRAScheduler( + optimizer, + lr_scheduler, + self.args.relora_steps, + warmup_steps, + ) + else: + self.lr_scheduler = lr_scheduler + + return self.lr_scheduler + + def add_position_ids(sample): sample["position_ids"] = torch.arange(len(sample["input_ids"])) return sample @@ -517,6 +559,8 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ weight_decay=cfg.weight_decay if cfg.weight_decay is not None else 0.0, sample_packing=cfg.sample_packing if cfg.sample_packing else False, sample_packing_seq_len_multiplier=cfg.micro_batch_size, + relora_steps=cfg.relora_steps, + relora_warmup_steps=cfg.relora_warmup_steps, **training_arguments_kwargs, ) @@ -589,6 +633,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ callbacks = [] callbacks.append(GPUStatsCallback(cfg)) + + if cfg.relora_steps: + callbacks.append(ReLoRACallback(cfg)) + # TODO on_save callback to sync checkpoints to GCP/AWS in background if cfg.early_stopping_patience: early_stop_cb = EarlyStoppingCallback( @@ -633,11 +681,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ num_proc=32, ) - trainer_cls = ( - OneCycleLRSchedulerTrainer - if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora") - else AxolotlTrainer - ) + trainer_cls = AxolotlTrainer + if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora"): + trainer_cls = OneCycleLRSchedulerTrainer + elif cfg.relora_steps: + trainer_cls = ReLoRATrainer trainer = trainer_cls( model=model, train_dataset=train_dataset, From cb9797ef5a069000d064d5b678e23ea023a535e4 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 24 Aug 2023 13:20:35 -0400 Subject: [PATCH 04/67] improve llama pad token handling (#475) * improve llama pad token handling * tweak logic to not clobber --- examples/gptq-lora-7b/config.yml | 2 +- src/axolotl/prompt_tokenizers.py | 2 +- src/axolotl/utils/data.py | 7 ++++--- src/axolotl/utils/models.py | 7 ++++--- 4 files changed, 10 insertions(+), 8 
deletions(-) diff --git a/examples/gptq-lora-7b/config.yml b/examples/gptq-lora-7b/config.yml index d5fbe3f134..d909f7d079 100644 --- a/examples/gptq-lora-7b/config.yml +++ b/examples/gptq-lora-7b/config.yml @@ -57,7 +57,7 @@ weight_decay: 0.0001 fsdp: fsdp_config: tokens: - pad_token: "[PAD]" + pad_token: "" bos_token: "" eos_token: "" unk_token: "" diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py index 9bdd5644a8..ed32ab24a2 100644 --- a/src/axolotl/prompt_tokenizers.py +++ b/src/axolotl/prompt_tokenizers.py @@ -13,7 +13,7 @@ LOG = logging.getLogger("axolotl") IGNORE_INDEX = -100 -LLAMA_DEFAULT_PAD_TOKEN = "[PAD]" # nosec +LLAMA_DEFAULT_PAD_TOKEN = "" # nosec LLAMA_DEFAULT_EOS_TOKEN = "" # nosec LLAMA_DEFAULT_BOS_TOKEN = "" # nosec LLAMA_DEFAULT_UNK_TOKEN = "" # nosec diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index f6a722a827..b801e6a576 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -54,9 +54,10 @@ def prepare_dataset(cfg, tokenizer): if not cfg.pretraining_dataset: - train_dataset, eval_dataset = load_prepare_datasets( - tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH - ) + with zero_first(is_main_process()): + train_dataset, eval_dataset = load_prepare_datasets( + tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH + ) else: train_dataset = load_pretraining_dataset( cfg.pretraining_dataset, diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 522ab3cb43..4fad740c58 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -22,7 +22,7 @@ PreTrainedTokenizerBase, ) -from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN +from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN from axolotl.utils.bench import log_gpu_memory_usage LOG = logging.getLogger("axolotl") @@ -58,8 +58,9 @@ def load_tokenizer(cfg): if tokenizer.__class__.__name__ in [ "LlamaTokenizer", "LlamaTokenizerFast", - ]: - tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN + ] and not hasattr(tokenizer, "pad_token"): + # set a pad_token, but use eos_token so we don't add a new token + tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}") LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}") From 71bd06243c91b23182debedc2675c2eab69adcaf Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 25 Aug 2023 14:30:50 +0900 Subject: [PATCH 05/67] Fix(tokenizer): Fix condition to add pad token (#477) * Fix(tokenizer): Fix condition to add pad token * chore: fix lint --- src/axolotl/utils/models.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 4fad740c58..64c80109ec 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -55,10 +55,15 @@ def load_tokenizer(cfg): **tokenizer_kwargs, ) - if tokenizer.__class__.__name__ in [ - "LlamaTokenizer", - "LlamaTokenizerFast", - ] and not hasattr(tokenizer, "pad_token"): + if ( + tokenizer.__class__.__name__ + in [ + "LlamaTokenizer", + "LlamaTokenizerFast", + ] + and hasattr(tokenizer, "pad_token") + and not tokenizer.pad_token + ): # set a pad_token, but use eos_token so we don't add a new token tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN From 0b7ba57ec42559bf75e5d1bc6ba58354a314d12e Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 25 Aug 2023 02:03:24 -0400 Subject: [PATCH 06/67] fix types w lora (#478) --- src/axolotl/utils/models.py | 35 
++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 64c80109ec..261acd9348 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -11,7 +11,6 @@ import torch import transformers from optimum.bettertransformer import BetterTransformer -from peft.tuners.lora import LoraLayer from transformers import ( # noqa: F401 AutoConfig, AutoModelForCausalLM, @@ -348,6 +347,14 @@ def load_model( if model.device.type == "cuda": log_gpu_memory_usage(LOG, "after model load", model.device) + # make sure these are fp32 per Ramesh et al. (2021) + for name, module in model.named_modules(): + if "norm" in name: + module.to(torch.float32) + if "lm_head" in name or "embed_tokens" in name: + if hasattr(module, "weight"): + module.to(torch.float32) + if not cfg.gptq and ( (cfg.adapter == "lora" and load_in_8bit) or (cfg.adapter == "qlora" and cfg.load_in_4bit) @@ -357,6 +364,16 @@ def load_model( model, use_gradient_checkpointing=cfg.gradient_checkpointing ) + # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to + # convert them back to fp16/bf16 for flash-attn compatibility. + if cfg.flash_attention and cfg.is_llama_derived_model: + for name, module in model.named_modules(): + if "norm" in name: + module.to(cfg.torch_dtype) + if "lm_head" in name or "embed_tokens" in name: + if hasattr(module, "weight"): + module.to(cfg.torch_dtype) + model, lora_config = load_adapter(model, cfg, cfg.adapter) if cfg.ddp and not load_in_8bit: @@ -500,22 +517,6 @@ def load_lora(model, cfg): else: model = get_peft_model(model, lora_config) - for name, module in model.named_modules(): - if isinstance(module, LoraLayer): - module = module.to(cfg.torch_dtype) - if "norm" in name: - module = module.to(torch.float32) - if "lm_head" in name or "embed_tokens" in name: - if hasattr(module, "weight"): - module = module.to(cfg.torch_dtype) - - # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to - # convert them back to fp16/bf16 for flash-attn compatibility. 
- if cfg.flash_attention and cfg.is_llama_derived_model: - for name, module in model.named_modules(): - if "norm" in name: - module = module.to(cfg.torch_dtype) - model.print_trainable_parameters() return model, lora_config From c29117a0d7fd6f2fa3a7e37bf049df3560a59e1b Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Sat, 26 Aug 2023 15:06:05 +0000 Subject: [PATCH 07/67] allow newer deps --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index a9f1d50470..6bc557aa55 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,9 +4,9 @@ bitsandbytes>=0.41.1 accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b addict fire -PyYAML==6.0 +PyYAML>=6.0 datasets -flash-attn==2.0.8 +flash-attn>=2.0.8 sentencepiece wandb einops @@ -15,7 +15,7 @@ optimum hf_transfer colorama numba -numpy==1.24.4 +numpy>=1.24.4 # qlora things bert-score==0.3.13 evaluate==0.4.0 From 31f3e717642b4de95263a05c9e8a57a4b1113217 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 26 Aug 2023 12:00:03 -0400 Subject: [PATCH 08/67] fix checkpints on multigpu (#481) --- src/axolotl/monkeypatch/relora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/monkeypatch/relora.py b/src/axolotl/monkeypatch/relora.py index e247fafd25..9dac77e18a 100644 --- a/src/axolotl/monkeypatch/relora.py +++ b/src/axolotl/monkeypatch/relora.py @@ -131,7 +131,7 @@ def on_save( and state.global_step % self.relora_steps != 0 ): if self.quantized: - if self.last_full_model != checkpoint_folder: + if is_main_process() and self.last_full_model != checkpoint_folder: # ensure the latest full parameter save is in the latest checkpoint # folder, so that automatic pruning of checkpoints does not remove it LOG.info(f"moving last full parameter save to {checkpoint_folder}") From c500d025174cacb3dff470e4f1086f84e8f5fb95 Mon Sep 17 00:00:00 2001 From: Maxime <672982+maximegmd@users.noreply.github.com> Date: Sat, 26 Aug 2023 18:02:15 +0200 Subject: [PATCH 09/67] Fix missing 'packaging' wheel (#482) --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 6bc557aa55..156d99b480 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +packaging peft @ git+https://github.com/huggingface/peft.git transformers @ git+https://github.com/huggingface/transformers.git bitsandbytes>=0.41.1 From f311df9462bf348317de57404a9abe6305da19d3 Mon Sep 17 00:00:00 2001 From: Maxime <672982+maximegmd@users.noreply.github.com> Date: Sat, 26 Aug 2023 22:34:11 +0200 Subject: [PATCH 10/67] fix: finetune model inference needs the dtype fix to work with flash-attn --- src/axolotl/utils/models.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 261acd9348..c95e346e10 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -355,6 +355,7 @@ def load_model( if hasattr(module, "weight"): module.to(torch.float32) + fix_dtype = False if not cfg.gptq and ( (cfg.adapter == "lora" and load_in_8bit) or (cfg.adapter == "qlora" and cfg.load_in_4bit) @@ -363,16 +364,19 @@ def load_model( model = prepare_model_for_kbit_training( model, use_gradient_checkpointing=cfg.gradient_checkpointing ) + fix_dtype = True - # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to - # convert them back to fp16/bf16 for flash-attn compatibility. 
- if cfg.flash_attention and cfg.is_llama_derived_model: - for name, module in model.named_modules(): - if "norm" in name: + # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to + # convert them back to fp16/bf16 for flash-attn compatibility. + if (fix_dtype or cfg.adapter == "" or cfg.adapter == None) and ( + cfg.flash_attention and cfg.is_llama_derived_model + ): + for name, module in model.named_modules(): + if "norm" in name: + module.to(cfg.torch_dtype) + if "lm_head" in name or "embed_tokens" in name: + if hasattr(module, "weight"): module.to(cfg.torch_dtype) - if "lm_head" in name or "embed_tokens" in name: - if hasattr(module, "weight"): - module.to(cfg.torch_dtype) model, lora_config = load_adapter(model, cfg, cfg.adapter) From a184549e4c44651555170eac5dc3384842d34112 Mon Sep 17 00:00:00 2001 From: Maxime <672982+maximegmd@users.noreply.github.com> Date: Sat, 26 Aug 2023 22:36:14 +0200 Subject: [PATCH 11/67] ignore: linter --- src/axolotl/utils/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index c95e346e10..fc2cf04b3b 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -368,7 +368,7 @@ def load_model( # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to # convert them back to fp16/bf16 for flash-attn compatibility. - if (fix_dtype or cfg.adapter == "" or cfg.adapter == None) and ( + if (fix_dtype or cfg.adapter == "" or cfg.adapter is None) and ( cfg.flash_attention and cfg.is_llama_derived_model ): for name, module in model.named_modules(): From 17605b85d8046b7dee53289175dea17b8700fe0b Mon Sep 17 00:00:00 2001 From: Maxime <672982+maximegmd@users.noreply.github.com> Date: Sat, 26 Aug 2023 22:40:56 +0200 Subject: [PATCH 12/67] fix: inference did not move the model to the correct device (#483) --- scripts/finetune.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/finetune.py b/scripts/finetune.py index 3255a623f2..d02448ec29 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -82,6 +82,8 @@ def do_inference(cfg, model, tokenizer, prompter: Optional[str]): max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None ) + model = model.to(cfg.device) + while True: print("=" * 80) # support for multiline inputs From d03887fad5044a90b1984baaad36387079ecd4f6 Mon Sep 17 00:00:00 2001 From: Maxime <672982+maximegmd@users.noreply.github.com> Date: Sat, 26 Aug 2023 22:45:45 +0200 Subject: [PATCH 13/67] ignore: address pr review --- src/axolotl/utils/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index fc2cf04b3b..71e27a2bc1 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -368,7 +368,7 @@ def load_model( # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to # convert them back to fp16/bf16 for flash-attn compatibility. 
- if (fix_dtype or cfg.adapter == "" or cfg.adapter is None) and ( + if (fix_dtype or not cfg.adapter) and ( cfg.flash_attention and cfg.is_llama_derived_model ): for name, module in model.named_modules(): From 868530c39c2d2a5ddcce1483bd73951fb376e18b Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Sat, 26 Aug 2023 21:40:12 +0000 Subject: [PATCH 14/67] let transformers handle adamw_bnb_8bit --- src/axolotl/utils/trainer.py | 73 +----------------------------------- 1 file changed, 2 insertions(+), 71 deletions(-) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index c73b4a7135..24be1b8c2a 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -10,19 +10,13 @@ from pathlib import Path from typing import Optional, Union -import bitsandbytes as bnb import numpy as np import torch.cuda -import transformers from datasets import Dataset, set_caching_enabled -from torch import nn from torch.optim.lr_scheduler import OneCycleLR from torch.utils.data import DataLoader, DistributedSampler, RandomSampler from transformers import EarlyStoppingCallback, Trainer, TrainingArguments -from transformers.trainer_pt_utils import ( - SequentialDistributedSampler, - get_parameter_names, -) +from transformers.trainer_pt_utils import SequentialDistributedSampler from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler from axolotl.utils.callbacks import ( @@ -32,10 +26,7 @@ ) from axolotl.utils.collators import DataCollatorForSeq2Seq from axolotl.utils.dataloader import MultipackDistributedDataloader -from axolotl.utils.schedulers import ( - InterpolatingLogScheduler, - get_cosine_schedule_with_quadratic_warmup, -) +from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup LOG = logging.getLogger("axolotl") @@ -570,66 +561,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ if Path(cfg.torchdistx_path).exists(): sys.path.append(cfg.torchdistx_path) importlib.import_module("torchdistx") - if ( - cfg.optimizer == "adamw_bnb_8bit" - and not cfg.gptq - and "deepspeed" not in training_arguments_kwargs - and not cfg.fsdp - ): - decay_parameters = get_parameter_names(model, [nn.LayerNorm]) - decay_parameters = [name for name in decay_parameters if "bias" not in name] - optimizer_grouped_parameters = [ - { - "params": [ - p - for n, p in model.named_parameters() - if (n in decay_parameters and p.requires_grad) - ], - "weight_decay": training_args.weight_decay, - }, - { - "params": [ - p - for n, p in model.named_parameters() - if (n not in decay_parameters and p.requires_grad) - ], - "weight_decay": 0.0, - }, - ] - - optimizer = bnb.optim.Adam8bit( - optimizer_grouped_parameters, - betas=(training_args.adam_beta1, training_args.adam_beta2), - eps=training_args.adam_epsilon, - lr=training_args.learning_rate, - ) - - if cfg.lr_scheduler == "one_cycle": - lr_scheduler_kwargs = ( - cfg.lr_scheduler_kwargs if cfg.lr_scheduler_kwargs else {} - ) - lr_scheduler = OneCycleLR( - optimizer, - cfg.learning_rate, - total_steps=total_num_steps, - epochs=cfg.num_epochs, - div_factor=cfg.lr_div_factor if cfg.lr_div_factor else 6, - **lr_scheduler_kwargs, - ) - elif cfg.lr_scheduler == "log_sweep": - lr_scheduler = InterpolatingLogScheduler( - optimizer, - cfg.warmup_steps, - cfg.log_sweep_min_lr if cfg.log_sweep_min_lr else 1e-10, - cfg.log_sweep_max_lr if cfg.log_sweep_max_lr else 10, - ) - else: - lr_scheduler = transformers.get_cosine_schedule_with_warmup( - optimizer, - training_args.warmup_steps, - 
total_num_steps, - ) - trainer_kwargs["optimizers"] = (optimizer, lr_scheduler) callbacks = [] callbacks.append(GPUStatsCallback(cfg)) From fe4d6baf9286e0eea18a3e752099a4fa16aef606 Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Sat, 26 Aug 2023 18:08:34 -0700 Subject: [PATCH 15/67] Add example Llama 2 ReLoRA config (#471) * Add example Llama 2 ReLoRA config * Use adamw_bnb_8bit in example relora config --- examples/llama-2/relora.yml | 73 +++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 examples/llama-2/relora.yml diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml new file mode 100644 index 0000000000..66515dabc2 --- /dev/null +++ b/examples/llama-2/relora.yml @@ -0,0 +1,73 @@ +base_model: meta-llama/Llama-2-7b-hf +base_model_config: meta-llama/Llama-2-7b-hf +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: false +load_in_4bit: true +strict: false + +datasets: + - path: teknium/GPT4-LLM-Cleaned + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./relora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 4096 +sample_packing: true + +lora_r: 8 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: +lora_target_linear: true +lora_fan_in_fan_out: + +relora_steps: 150 +relora_warmup_steps: 10 +relora_cpu_offload: false + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 4 +num_epochs: 3 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: 50 +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" From ad8be435ad42dc7f1feb3740a2b7b961f23364f8 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Sun, 27 Aug 2023 10:09:09 +0900 Subject: [PATCH 16/67] Feat(doc): Update eval_steps doc (#487) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a81ac8b501..94427fcd01 100644 --- a/README.md +++ b/README.md @@ -521,7 +521,7 @@ lr_quadratic_warmup: logging_steps: save_strategy: # set to `no` to skip checkpoint saves save_steps: # leave empty to save at each epoch -eval_steps: +eval_steps: # leave empty to eval at each epoch save_total_limit: # checkpoints saved at a time max_steps: From 3fc900629881e369a41ab656d4811c3a0410ea89 Mon Sep 17 00:00:00 2001 From: mhenrichsen Date: Sun, 27 Aug 2023 03:10:33 +0200 Subject: [PATCH 17/67] Feat(deepspeed): Add zero2 config (#476) * zero2 config * config added * linting --------- Co-authored-by: mhenrichsen --- deepspeed/zero2.json | 46 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 deepspeed/zero2.json diff --git a/deepspeed/zero2.json b/deepspeed/zero2.json new file mode 100644 index 0000000000..9b8a2a9b6b --- /dev/null +++ b/deepspeed/zero2.json @@ -0,0 +1,46 @@ +{ + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu" + }, + "contiguous_gradients": true, + "overlap_comm": true + }, + "bf16": { + "enabled": "auto" + }, + "fp16": { + "enabled": "auto", + "auto_cast": false, + "loss_scale": 0, + 
"initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-8, + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} From 35130711d634966de5aee6a94b3613551c521db6 Mon Sep 17 00:00:00 2001 From: mhenrichsen Date: Sun, 27 Aug 2023 03:20:17 +0200 Subject: [PATCH 18/67] Feat(cfg): Add code-llama configs for all sizes (#479) * configs for all sizes * update tokenizer type --------- Co-authored-by: mhenrichsen --- examples/code-llama/13b/lora.yml | 67 ++++++++++++++++++++++++++++++ examples/code-llama/13b/qlora.yml | 69 +++++++++++++++++++++++++++++++ examples/code-llama/34b/lora.yml | 67 ++++++++++++++++++++++++++++++ examples/code-llama/34b/qlora.yml | 69 +++++++++++++++++++++++++++++++ examples/code-llama/7b/lora.yml | 67 ++++++++++++++++++++++++++++++ examples/code-llama/7b/qlora.yml | 69 +++++++++++++++++++++++++++++++ examples/code-llama/README.md | 22 ++++++++++ 7 files changed, 430 insertions(+) create mode 100644 examples/code-llama/13b/lora.yml create mode 100644 examples/code-llama/13b/qlora.yml create mode 100644 examples/code-llama/34b/lora.yml create mode 100644 examples/code-llama/34b/qlora.yml create mode 100644 examples/code-llama/7b/lora.yml create mode 100644 examples/code-llama/7b/qlora.yml create mode 100644 examples/code-llama/README.md diff --git a/examples/code-llama/13b/lora.yml b/examples/code-llama/13b/lora.yml new file mode 100644 index 0000000000..637c051436 --- /dev/null +++ b/examples/code-llama/13b/lora.yml @@ -0,0 +1,67 @@ +base_model: codellama/CodeLlama-13b-hf +base_model_config: codellama/CodeLlama-13b-hf +model_type: LlamaForCausalLM +tokenizer_type: CodeLlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: true +load_in_4bit: false +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./lora-out + +sequence_len: 100000 +sample_packing: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 3 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/examples/code-llama/13b/qlora.yml b/examples/code-llama/13b/qlora.yml new file mode 100644 index 0000000000..ae78f5bf2c --- /dev/null +++ b/examples/code-llama/13b/qlora.yml @@ -0,0 +1,69 @@ +base_model: codellama/CodeLlama-13b-hf +base_model_config: codellama/CodeLlama-13b-hf +model_type: LlamaForCausalLM +tokenizer_type: CodeLlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: false +load_in_4bit: true +strict: false + 
+datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./qlora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 100000 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 3 +optimizer: paged_adamw_32bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/examples/code-llama/34b/lora.yml b/examples/code-llama/34b/lora.yml new file mode 100644 index 0000000000..9c4cfee10f --- /dev/null +++ b/examples/code-llama/34b/lora.yml @@ -0,0 +1,67 @@ +base_model: codellama/CodeLlama-34b-hf +base_model_config: codellama/CodeLlama-34b-hf +model_type: LlamaForCausalLM +tokenizer_type: CodeLlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: true +load_in_4bit: false +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./lora-out + +sequence_len: 100000 +sample_packing: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 3 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/examples/code-llama/34b/qlora.yml b/examples/code-llama/34b/qlora.yml new file mode 100644 index 0000000000..9f5ce50f96 --- /dev/null +++ b/examples/code-llama/34b/qlora.yml @@ -0,0 +1,69 @@ +base_model: codellama/CodeLlama-34b-hf +base_model_config: codellama/CodeLlama-34b-hf +model_type: LlamaForCausalLM +tokenizer_type: CodeLlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: false +load_in_4bit: true +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./qlora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 100000 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 3 +optimizer: paged_adamw_32bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true 
+early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/examples/code-llama/7b/lora.yml b/examples/code-llama/7b/lora.yml new file mode 100644 index 0000000000..dfa3f2f7a3 --- /dev/null +++ b/examples/code-llama/7b/lora.yml @@ -0,0 +1,67 @@ +base_model: codellama/CodeLlama-7b-hf +base_model_config: codellama/CodeLlama-7b-hf +model_type: LlamaForCausalLM +tokenizer_type: CodeLlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: true +load_in_4bit: false +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./lora-out + +sequence_len: 100000 +sample_packing: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 3 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/examples/code-llama/7b/qlora.yml b/examples/code-llama/7b/qlora.yml new file mode 100644 index 0000000000..704f058c31 --- /dev/null +++ b/examples/code-llama/7b/qlora.yml @@ -0,0 +1,69 @@ +base_model: codellama/CodeLlama-7b-hf +base_model_config: codellama/CodeLlama-7b-hf +model_type: LlamaForCausalLM +tokenizer_type: CodeLlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: false +load_in_4bit: true +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./qlora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 100000 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 3 +optimizer: paged_adamw_32bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/examples/code-llama/README.md b/examples/code-llama/README.md new file mode 100644 index 0000000000..a5011e3472 --- /dev/null +++ b/examples/code-llama/README.md @@ -0,0 +1,22 @@ +# Overview + +This is an example of CodeLLaMA configuration for 7b, 13b and 34b. + +The 7b variant fits on any 24GB VRAM GPU and will take up about 17 GB of VRAM during training if using qlora and 20 GB if using lora. 
On a RTX 4090 it trains 3 epochs of the default dataset in about 15 minutes. + +The 13b variant will fit if you change these settings to these values: +gradient_accumulation_steps: 2 +micro_batch_size: 1 + +The 34b variant does not fit on 24GB of VRAM - you will need something with +40 gb VRAM that also supports flash attention v2 - A6000 or A100 are good choices. + +```shell +accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/qlora.yml + +``` +or + +```shell +accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/lora.yml + +``` From 9e699683d79a21aeffde7970f4af07febbd341e8 Mon Sep 17 00:00:00 2001 From: Maxime <672982+maximegmd@users.noreply.github.com> Date: Sun, 27 Aug 2023 21:01:37 +0200 Subject: [PATCH 19/67] Update src/axolotl/utils/models.py Co-authored-by: Aman Gupta Karmani --- src/axolotl/utils/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 71e27a2bc1..ed917d963e 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -355,7 +355,7 @@ def load_model( if hasattr(module, "weight"): module.to(torch.float32) - fix_dtype = False + fix_dtype = not cfg.adapter if not cfg.gptq and ( (cfg.adapter == "lora" and load_in_8bit) or (cfg.adapter == "qlora" and cfg.load_in_4bit) From 7fd662dd89e4fb8e97a7b1fbb4328f33220f60c1 Mon Sep 17 00:00:00 2001 From: Maxime <672982+maximegmd@users.noreply.github.com> Date: Sun, 27 Aug 2023 21:01:43 +0200 Subject: [PATCH 20/67] Update src/axolotl/utils/models.py Co-authored-by: Aman Gupta Karmani --- src/axolotl/utils/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index ed917d963e..4575f5966e 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -368,7 +368,7 @@ def load_model( # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to # convert them back to fp16/bf16 for flash-attn compatibility. - if (fix_dtype or not cfg.adapter) and ( + if fix_dtype and ( cfg.flash_attention and cfg.is_llama_derived_model ): for name, module in model.named_modules(): From f319b0bc67b548f509ca8ddc3922c028c733bea7 Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Sun, 27 Aug 2023 19:55:11 +0000 Subject: [PATCH 21/67] rename var and reformat --- src/axolotl/utils/models.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 4575f5966e..dd75106ec4 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -355,7 +355,7 @@ def load_model( if hasattr(module, "weight"): module.to(torch.float32) - fix_dtype = not cfg.adapter + needs_fa2_dtype = not cfg.adapter if not cfg.gptq and ( (cfg.adapter == "lora" and load_in_8bit) or (cfg.adapter == "qlora" and cfg.load_in_4bit) @@ -364,13 +364,11 @@ def load_model( model = prepare_model_for_kbit_training( model, use_gradient_checkpointing=cfg.gradient_checkpointing ) - fix_dtype = True + needs_fa2_dtype = True # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to # convert them back to fp16/bf16 for flash-attn compatibility. 
- if fix_dtype and ( - cfg.flash_attention and cfg.is_llama_derived_model - ): + if needs_fa2_dtype and (cfg.flash_attention and cfg.is_llama_derived_model): for name, module in model.named_modules(): if "norm" in name: module.to(cfg.torch_dtype) From 3a011ea1ef4ddee446e22849651783dd758dfda6 Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Sun, 27 Aug 2023 20:09:26 +0000 Subject: [PATCH 22/67] fix condition and add logging --- src/axolotl/utils/models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index dd75106ec4..c2fbc19e39 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -355,7 +355,7 @@ def load_model( if hasattr(module, "weight"): module.to(torch.float32) - needs_fa2_dtype = not cfg.adapter + needs_fa2_dtype = cfg.adapter is not None if not cfg.gptq and ( (cfg.adapter == "lora" and load_in_8bit) or (cfg.adapter == "qlora" and cfg.load_in_4bit) @@ -369,6 +369,7 @@ def load_model( # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to # convert them back to fp16/bf16 for flash-attn compatibility. if needs_fa2_dtype and (cfg.flash_attention and cfg.is_llama_derived_model): + LOG.info("converting modules to %s for flash attention", cfg.torch_dtype) for name, module in model.named_modules(): if "norm" in name: module.to(cfg.torch_dtype) From 4c37bd0b546c421f0680839210f2e963d8d3f35a Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Mon, 28 Aug 2023 09:39:10 +0900 Subject: [PATCH 23/67] Fix(tokenizer): Make sure to add pad for CodeLlamaTokenizer (#489) --- src/axolotl/utils/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index c2fbc19e39..6cd1cd7ea3 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -59,6 +59,7 @@ def load_tokenizer(cfg): in [ "LlamaTokenizer", "LlamaTokenizerFast", + "CodeLlamaTokenizer", ] and hasattr(tokenizer, "pad_token") and not tokenizer.pad_token From 98bf76e236ee7450cdfde506d9f409e5e28cb2a2 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 28 Aug 2023 04:33:50 -0400 Subject: [PATCH 24/67] fsdp requires params be the same type too (#493) --- src/axolotl/utils/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 6cd1cd7ea3..063e439772 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -356,7 +356,7 @@ def load_model( if hasattr(module, "weight"): module.to(torch.float32) - needs_fa2_dtype = cfg.adapter is not None + needs_fa2_dtype = cfg.adapter or cfg.fsdp if not cfg.gptq and ( (cfg.adapter == "lora" and load_in_8bit) or (cfg.adapter == "qlora" and cfg.load_in_4bit) From 267b7b24e59bebd6a0934457f244676c6eb2f97d Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Mon, 28 Aug 2023 13:10:26 +0000 Subject: [PATCH 25/67] simplify linear layer locator --- src/axolotl/utils/models.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 063e439772..d0e5128ef2 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -464,12 +464,8 @@ def load_llama_adapter(model, cfg): return model, peft_config -def find_all_linear_names(bits, model): - cls = ( - bnb.nn.Linear4bit - if bits == 4 - else (bnb.nn.Linear8bitLt if bits == 8 else torch.nn.Linear) - ) +def find_all_linear_names(model): + cls = (bnb.nn.Linear4bit, 
bnb.nn.Linear8bitLt, torch.nn.Linear) lora_module_names = set() for name, module in model.named_modules(): if isinstance(module, cls): @@ -490,13 +486,7 @@ def load_lora(model, cfg): lora_target_modules = list(cfg.lora_target_modules or []) if cfg.lora_target_linear: - bits = None - if cfg.load_in_4bit: - bits = 4 - elif cfg.load_in_8bit: - bits = 8 - - linear_names = find_all_linear_names(bits, model) + linear_names = find_all_linear_names(model) LOG.info(f"found linear modules: {repr(linear_names)}") lora_target_modules = list(set(lora_target_modules + linear_names)) From 8e197f6fb48da66a82e804dca39bc5932c7055e3 Mon Sep 17 00:00:00 2001 From: Birch-san Date: Mon, 28 Aug 2023 23:47:16 +0100 Subject: [PATCH 26/67] pad_to_worst_case_seq_len boolean, for testing memory limits (#498) * pad_to_worst_case_seq_len boolean, for testing memory limits * remove collator_pad_to_longest option since it does nothing see docs: https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorWithPadding.padding True and "longest" mean the same thing * rename to `pad_to_sequence_len, and ensure 64 alignment --------- Co-authored-by: Aman Karmani --- README.md | 6 +++--- examples/pythia-12b/config.yml | 1 - src/axolotl/utils/trainer.py | 6 +++--- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 94427fcd01..a18c1108f2 100644 --- a/README.md +++ b/README.md @@ -459,6 +459,9 @@ dataset_shard_idx: # the maximum length of an input to train with, this should typically be less than 2048 # as most models have a token/context limit of 2048 sequence_len: 2048 +# pad inputs so each step uses constant sized buffers +# this will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently +pad_to_sequence_len: # max sequence length to concatenate training samples together up to # inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning # FutureWarning: This will soon be DEPRECATED @@ -610,9 +613,6 @@ deepspeed: # Path to torch distx for optim 'adamw_anyprecision' torchdistx_path: -# Set padding for data collator to 'longest' -collator_pad_to_longest: - # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize pretraining_dataset: diff --git a/examples/pythia-12b/config.yml b/examples/pythia-12b/config.yml index 535e5cd37d..dc06eb6b6d 100644 --- a/examples/pythia-12b/config.yml +++ b/examples/pythia-12b/config.yml @@ -47,4 +47,3 @@ local_rank: gradient_checkpointing: true fsdp: fsdp_config: -collator_pad_to_longest: true diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 24be1b8c2a..1bc190fe28 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -585,10 +585,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ callbacks.append(SaveBetterTransformerModelCallback) data_collator_kwargs = { - "padding": True, + "padding": True, # True/"longest" is the default } - if cfg.collator_pad_to_longest: - data_collator_kwargs["padding"] = "longest" + if cfg.pad_to_sequence_len: + data_collator_kwargs["pad_to_multiple_of"] = 64 * round(cfg.sequence_len / 64) else: # A100 is best at 64, while others at 8. 
Let's use the larger so we don't have to check # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html From fd55bc87e2805e37038c25acc5005f3bcf525da2 Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Tue, 29 Aug 2023 01:03:41 +0000 Subject: [PATCH 27/67] use math.ceil instead of round /cc #498 --- src/axolotl/utils/trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 1bc190fe28..fcbdd6d3e2 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -588,7 +588,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ "padding": True, # True/"longest" is the default } if cfg.pad_to_sequence_len: - data_collator_kwargs["pad_to_multiple_of"] = 64 * round(cfg.sequence_len / 64) + data_collator_kwargs["pad_to_multiple_of"] = 64 * math.ceil( + cfg.sequence_len / 64 + ) else: # A100 is best at 64, while others at 8. Let's use the larger so we don't have to check # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html From 125cccb7864219c26b13a45966f46b9c16e1f1ff Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 29 Aug 2023 05:37:53 -0700 Subject: [PATCH 28/67] Refactor train cfg cli (#499) * wip to cleanup cfg cli options * fix launcher * fix cli args --- scripts/finetune.py | 126 +++++++++++++++++++++++------------- src/axolotl/utils/models.py | 40 ++++++------ 2 files changed, 101 insertions(+), 65 deletions(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index d02448ec29..454a627a1b 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -6,11 +6,13 @@ import random import signal import sys +from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, List, Optional, Union import fire import torch +import transformers import yaml # add src to the pythonpath so we don't need to pip install this @@ -22,7 +24,7 @@ from axolotl.utils.data import prepare_dataset from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import is_main_process -from axolotl.utils.models import load_model, load_tokenizer +from axolotl.utils.models import load_model, load_model_config, load_tokenizer from axolotl.utils.tokenization import check_dataset_labels from axolotl.utils.trainer import setup_trainer from axolotl.utils.wandb import setup_wandb_env_vars @@ -37,6 +39,20 @@ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" +@dataclass +class TrainerCliArgs: + """ + dataclass representing the various non-training arguments + """ + + debug: bool = field(default=False) + inference: bool = field(default=False) + merge_lora: bool = field(default=False) + prepare_ds_only: bool = field(default=False) + prompter: Optional[str] = field(default=None) + shard: bool = field(default=False) + + def print_axolotl_text_art(): ascii_art = """ dP dP dP @@ -61,6 +77,8 @@ def get_multi_line_input() -> Optional[str]: def do_inference(cfg, model, tokenizer, prompter: Optional[str]): + if prompter == "None": + prompter = None default_tokens = {"unk_token": "", "bos_token": "", "eos_token": ""} for token, symbol in default_tokens.items(): @@ -158,45 +176,20 @@ def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> b def train( - config: Path = Path("configs/"), - prepare_ds_only: bool = False, - **kwargs, + *, + cfg: DictDefault, + cli_args: TrainerCliArgs, ): - print_axolotl_text_art() - if Path(config).is_dir(): - config = 
choose_config(config) - - # load the config from the yaml file - with open(config, encoding="utf-8") as file: - cfg: DictDefault = DictDefault(yaml.safe_load(file)) - # if there are any options passed in the cli, if it is something that seems valid from the yaml, - # then overwrite the value - cfg_keys = cfg.keys() - for k, _ in kwargs.items(): - # if not strict, allow writing to cfg even if it's not in the yml already - if k in cfg_keys or not cfg.strict: - # handle booleans - if isinstance(cfg[k], bool): - cfg[k] = bool(kwargs[k]) - else: - cfg[k] = kwargs[k] - - validate_config(cfg) - - normalize_config(cfg) - - setup_wandb_env_vars(cfg) - # load the tokenizer first LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}") tokenizer = load_tokenizer(cfg) - if ( - check_not_in(["shard", "merge_lora"], kwargs) and not cfg.inference + if not ( + cli_args.shard or cli_args.merge_lora or cli_args.inference ): # don't need to load dataset for these train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer) - if cfg.debug or "debug" in kwargs: + if cli_args.debug or cfg.debug: LOG.info("check_dataset_labels...") check_dataset_labels( train_dataset.select( @@ -205,17 +198,17 @@ def train( tokenizer, ) - if prepare_ds_only: + if cli_args.prepare_ds_only: LOG.info("Finished preparing dataset. Exiting...") return # Load the model and tokenizer LOG.info("loading model and (optionally) peft_config...") - model, peft_config = load_model(cfg, tokenizer) + model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference) safe_serialization = cfg.save_safetensors is True - if "merge_lora" in kwargs and cfg.adapter is not None: + if cli_args.merge_lora and cfg.adapter is not None: LOG.info("running merge of LoRA with base model") model = model.merge_and_unload() model.to(dtype=torch.float16) @@ -229,18 +222,13 @@ def train( tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged")) return - if cfg.inference: - LOG.info("calling do_inference function") - prompter: Optional[str] = "AlpacaPrompter" - if "prompter" in kwargs: - if kwargs["prompter"] == "None": - prompter = None - else: - prompter = kwargs["prompter"] - do_inference(cfg, model, tokenizer, prompter=prompter) + if cli_args.inference: + LOG.debug("Running inference on model") + do_inference(cfg, model, tokenizer, prompter=cli_args.prompter) return - if "shard" in kwargs: + if cli_args.shard: + LOG.debug("Re-saving model w/ sharding") model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization) return @@ -322,5 +310,51 @@ def terminate_handler(_, __, model): model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization) +def load_cfg(config: Path = Path("examples/"), **kwargs): + if Path(config).is_dir(): + config = choose_config(config) + + # load the config from the yaml file + with open(config, encoding="utf-8") as file: + cfg: DictDefault = DictDefault(yaml.safe_load(file)) + # if there are any options passed in the cli, if it is something that seems valid from the yaml, + # then overwrite the value + cfg_keys = cfg.keys() + for k, _ in kwargs.items(): + # if not strict, allow writing to cfg even if it's not in the yml already + if k in cfg_keys or not cfg.strict: + # handle booleans + if isinstance(cfg[k], bool): + cfg[k] = bool(kwargs[k]) + else: + cfg[k] = kwargs[k] + + model_config = load_model_config(cfg) + + # figure out if the model is llama + cfg.is_llama_derived_model = ( + (hasattr(model_config, "model_type") and model_config.model_type == 
"llama") + or cfg.is_llama_derived_model + or "llama" in cfg.base_model + or (cfg.model_type and "llama" in cfg.model_type.lower()) + ) + validate_config(cfg) + + normalize_config(cfg) + + setup_wandb_env_vars(cfg) + return cfg + + +def do_train(config: Path = Path("examples/"), **kwargs): + print_axolotl_text_art() + parsed_cfg = load_cfg(config, **kwargs) + parser = transformers.HfArgumentParser((TrainerCliArgs)) + parsed_cli_args, _ = parser.parse_args_into_dataclasses( + return_remaining_strings=True + ) + train(cfg=parsed_cfg, cli_args=parsed_cli_args) + + if __name__ == "__main__": - fire.Fire(train) + fire.Fire(do_train) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index d0e5128ef2..4b9c79d848 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -5,12 +5,13 @@ import math import os from pathlib import Path -from typing import TYPE_CHECKING, Optional, Tuple # noqa: F401 +from typing import Optional, Tuple # noqa: F401 import bitsandbytes as bnb import torch import transformers from optimum.bettertransformer import BetterTransformer +from peft import PeftConfig from transformers import ( # noqa: F401 AutoConfig, AutoModelForCausalLM, @@ -23,13 +24,17 @@ from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN from axolotl.utils.bench import log_gpu_memory_usage +from axolotl.utils.dict import DictDefault LOG = logging.getLogger("axolotl") -if TYPE_CHECKING: - from peft import PeftConfig # noqa: F401 - from axolotl.utils.dict import DictDefault # noqa: F401 +def load_model_config(cfg): + model_config_name = cfg.base_model_config or cfg.base_model + trust_remote_code: bool = False or cfg.trust_remote_code + return AutoConfig.from_pretrained( + model_config_name, trust_remote_code=trust_remote_code + ) def load_tokenizer(cfg): @@ -86,8 +91,10 @@ def load_tokenizer(cfg): def load_model( - cfg, tokenizer -): # type: (DictDefault, PreTrainedTokenizerBase) -> Tuple[PreTrainedModel, Optional[PeftConfig]] + cfg: DictDefault, + tokenizer: PreTrainedTokenizerBase, + inference: bool = False, +) -> Tuple[PreTrainedModel, Optional[PeftConfig]]: """ Load a model for a given configuration and tokenizer. 
""" @@ -97,14 +104,9 @@ def load_model( # TODO refactor as a kwarg load_in_8bit = cfg.load_in_8bit - cfg.is_llama_derived_model = ( - "llama" in base_model - or (cfg.model_type and "llama" in cfg.model_type.lower()) - or cfg.is_llama_derived_model - ) if cfg.is_llama_derived_model and cfg.flash_attention: - if cfg.device not in ["mps", "cpu"] and not cfg.inference: + if cfg.device not in ["mps", "cpu"] and not inference: from axolotl.monkeypatch.llama_attn_hijack_flash import ( replace_llama_attn_with_flash_attn, ) @@ -146,7 +148,7 @@ def load_model( if ( cfg.is_llama_derived_model and (cfg.max_packed_sequence_len or cfg.sample_packing) - and not cfg.inference + and not inference ): from axolotl.monkeypatch.llama_expand_mask import hijack_expand_mask @@ -424,15 +426,15 @@ def load_model( return model, lora_config -def load_adapter(model, cfg, adapter): - # type: (PreTrainedModel, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]] +def load_adapter(model, cfg, adapter, inference=False): + # type: (PreTrainedModel, DictDefault, Optional[str], bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]] if adapter is None: return model, None if hasattr(model, "enable_input_require_grads"): model.enable_input_require_grads() if adapter in ["lora", "qlora"]: - return load_lora(model, cfg) + return load_lora(model, cfg, inference=inference) if adapter == "llama-adapter": return load_llama_adapter(model, cfg) @@ -478,8 +480,8 @@ def find_all_linear_names(model): return list(lora_module_names) -def load_lora(model, cfg): - # type: (PreTrainedModel, DictDefault) -> Tuple[PreTrainedModel, Optional[PeftConfig]] +def load_lora(model, cfg, inference=False): + # type: (PreTrainedModel, DictDefault, bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]] from peft import LoraConfig, PeftModel, get_peft_model @@ -506,7 +508,7 @@ def load_lora(model, cfg): model = PeftModel.from_pretrained( model, cfg.lora_model_dir, - is_trainable=not cfg.inference, + is_trainable=(not inference), ) else: model = get_peft_model(model, lora_config) From 36b2e1cfee26d097e4de5541cf27ad8887e9fb60 Mon Sep 17 00:00:00 2001 From: Maxime <672982+maximegmd@users.noreply.github.com> Date: Tue, 29 Aug 2023 15:17:10 +0200 Subject: [PATCH 29/67] tweak: use default config file when only one file is present (#501) --- scripts/finetune.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/finetune.py b/scripts/finetune.py index 454a627a1b..044199c161 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -153,6 +153,10 @@ def choose_config(path: Path): "No YAML config files found in the specified directory. Are you using a .yml extension?" ) + if len(yaml_files) == 1: + print(f"Using default YAML file '{yaml_files[0]}'") + return yaml_files[0] + print("Choose a YAML file:") for idx, file in enumerate(yaml_files): print(f"{idx + 1}. 
{file}") From 48c56470d025829f0d02b278bab7531253ef0203 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Tue, 29 Aug 2023 22:17:37 +0900 Subject: [PATCH 30/67] Fix(doc): Clarify no amp to full yaml docs (#496) --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index a18c1108f2..0da9dc5060 100644 --- a/README.md +++ b/README.md @@ -407,6 +407,10 @@ fp16: true # Use CUDA tf32 tf32: true # require >=ampere +# No AMP (automatic mixed precision) +bfloat16: true # require >=ampere +float16: true + # a list of one or more datasets to finetune the model with datasets: # hf dataset repo | "json" for local dataset, make sure to fill data_files From e356b297cb9470154808eb013227cb1984c05d20 Mon Sep 17 00:00:00 2001 From: Aman Gupta Karmani Date: Tue, 29 Aug 2023 09:17:51 -0400 Subject: [PATCH 31/67] remove --force-reinstall from Dockerfile to ensure correct pytorch version (#492) --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index a9d94f03ce..b429d50f27 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -11,7 +11,7 @@ RUN apt-get update && \ WORKDIR /workspace -RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" +RUN pip3 install "peft @ git+https://github.com/huggingface/peft.git@main" RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git # If AXOLOTL_EXTRAS is set, append it in brackets RUN cd axolotl && \ From 5ac3392075bd5e858002db9c1c1a3968495033ea Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 29 Aug 2023 06:18:17 -0700 Subject: [PATCH 32/67] support for datasets with multiple names (#480) * support for datasets with multiple names * update docs --- README.md | 9 +++++++++ src/axolotl/utils/data.py | 11 ++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0da9dc5060..204e2141a8 100644 --- a/README.md +++ b/README.md @@ -328,6 +328,15 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod name: enron_emails type: completion # format from earlier + # huggingface repo with multiple named configurations/subsets + datasets: + - path: bigcode/commitpackft + name: + - ruby + - python + - typescript + type: ... 
# unimplemented custom format + # local datasets: - path: data.jsonl # or json diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index b801e6a576..20d0fcfb88 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -134,8 +134,17 @@ def load_tokenized_prepared_datasets( seed = 42 datasets = [] + + def for_d_in_datasets(dataset_configs): + for dataset in dataset_configs: + if dataset.name and isinstance(dataset.name, list): + for name in dataset.name: + yield DictDefault({**dataset, "name": name}) + else: + yield dataset + # pylint: disable=invalid-name - for d in cfg.datasets: + for d in for_d_in_datasets(cfg.datasets): ds: Union[Dataset, DatasetDict] = None ds_from_hub = False try: From 548787daaeaf48f0cd23231a45c7375fb318b097 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 29 Aug 2023 10:13:42 -0700 Subject: [PATCH 33/67] customizable ascii art (#506) --- requirements.txt | 1 + scripts/finetune.py | 17 +++++++---------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/requirements.txt b/requirements.txt index 156d99b480..0ae20f3001 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,3 +24,4 @@ rouge-score==0.1.2 scipy scikit-learn==1.2.2 pynvml +art diff --git a/scripts/finetune.py b/scripts/finetune.py index 044199c161..8019af8e3f 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -16,6 +16,7 @@ import yaml # add src to the pythonpath so we don't need to pip install this +from art import text2art from optimum.bettertransformer import BetterTransformer from transformers import GenerationConfig, TextStreamer @@ -53,16 +54,12 @@ class TrainerCliArgs: shard: bool = field(default=False) -def print_axolotl_text_art(): - ascii_art = """ - dP dP dP - 88 88 88 -.d8888b. dP. .dP .d8888b. 88 .d8888b. d8888P 88 -88' `88 `8bd8' 88' `88 88 88' `88 88 88 -88. .88 .d88b. 88. .88 88 88. 
.88 88 88 -`88888P8 dP' `dP `88888P' dP `88888P' dP dP -""" - +def print_axolotl_text_art(suffix=None): + font = "nancyj" + ascii_text = " axolotl" + if suffix: + ascii_text += f" x {suffix}" + ascii_art = text2art(" axolotl", font=font) if is_main_process(): print(ascii_art) From 76576323dfb8ac42d81b97a3809eb423da7538b6 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 29 Aug 2023 13:24:19 -0700 Subject: [PATCH 34/67] add eval benchmark callback (#441) * add mmlu callback * use hf dataset for mmlu evals * default to mmlu-zs * make sure to define all the explicit positional args * include metrics in callback * another callback fix for collator max len attribute * fix mmlu evals * sample benchmarks, ensure we drop long samples * fix the data file * fix elif and add better messaging * more fixes * rename mmlu to bench * more fixes * dataset handling and aggregate across benchmark * better handling when no subjects * benchmark callback has its own dataloader and collator * fixes * updated dataset * more fixes * missing transformers import * improve support for customized dataset for bench evals * gather benchmarks from all ranks * fix for gather across multiple gpus --- requirements.txt | 1 + src/axolotl/utils/callbacks.py | 210 +++++++++++++++++++++++++++++++ src/axolotl/utils/distributed.py | 38 ++++++ src/axolotl/utils/trainer.py | 72 ++++++++++- 4 files changed, 320 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0ae20f3001..fcd7f9292a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ transformers @ git+https://github.com/huggingface/transformers.git bitsandbytes>=0.41.1 accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b addict +evaluate fire PyYAML>=6.0 datasets diff --git a/src/axolotl/utils/callbacks.py b/src/axolotl/utils/callbacks.py index ddc179f390..92333f4cab 100644 --- a/src/axolotl/utils/callbacks.py +++ b/src/axolotl/utils/callbacks.py @@ -1,9 +1,19 @@ """Callbacks for Trainer class""" +from __future__ import annotations + import logging import os +from typing import TYPE_CHECKING, Dict, List +import evaluate +import numpy as np +import pandas as pd +import torch +import torch.distributed as dist +from datasets import load_dataset from optimum.bettertransformer import BetterTransformer +from tqdm import tqdm from transformers import ( TrainerCallback, TrainerControl, @@ -13,8 +23,19 @@ from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy from axolotl.utils.bench import log_gpu_memory_usage +from axolotl.utils.distributed import ( + barrier, + gather_scalar_from_all_ranks, + get_world_size, + is_main_process, + zero_first, +) + +if TYPE_CHECKING: + from axolotl.utils.trainer import AxolotlTrainingArguments LOG = logging.getLogger("axolotl.callbacks") +IGNORE_INDEX = -100 class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-methods @@ -96,3 +117,192 @@ def on_step_end( log_gpu_memory_usage(LOG, "while training", self.cfg.device) self.logged = True return control + + +def bench_eval_callback_factory(trainer, tokenizer): + accuracy = evaluate.load("accuracy") + abcd_idx = [ + tokenizer("A", add_special_tokens=False).input_ids[0], + tokenizer("B", add_special_tokens=False).input_ids[0], + tokenizer("C", add_special_tokens=False).input_ids[0], + tokenizer("D", add_special_tokens=False).input_ids[0], + tokenizer("E", add_special_tokens=False).input_ids[0], + tokenizer("F", add_special_tokens=False).input_ids[0], + tokenizer("G", 
add_special_tokens=False).input_ids[0], + ] + bench_split = "eval" + + def transform_bench_subject(example): + # Split on ':' and trim whitespace + parts = example["subject"].split(":") + first_part = ( + parts[0].strip().lower().replace("-", "_") + ) # Lowercase the first part + second_part = ( + parts[1].strip().replace("-", "_") if len(parts) > 1 else "all" + ) # Replace hyphens with underscores + + # Return the transformed values + return {"name": first_part, "subject": second_part} + + if trainer.args.bench_dataset == "mmlu-zs": + bench_dataset = load_dataset( + "openaccess-ai-collective/mmlu-evals", + data_files={ + "eval": "zero_shot_mmlu_val.json", + "test": "zero_shot_mmlu_test.json", + }, + ) + # bench_dataset = bench_dataset.remove_columns("subject") + # MMLU Five-shot (Eval/Test only) + elif trainer.args.bench_dataset in ["mmlu", "mmlu-fs"]: + bench_dataset = load_dataset( + "openaccess-ai-collective/mmlu-evals", + data_files={ + "eval": "five_shot_mmlu_val.json", + "test": "five_shot_mmlu_test.json", + }, + ) + # bench_dataset = bench_dataset.remove_columns('subject') + elif "/" in trainer.args.bench_dataset: + bench_ds = trainer.args.bench_dataset + bench_ds_name = "/".join(bench_ds.split("/", 2)[:2]) + bench_ds_data_file = "/".join(bench_ds.split("/", 2)[2:]) + bench_dataset = load_dataset( + bench_ds_name, + data_files={ + "eval": bench_ds_data_file, + }, + ) + bench_dataset["eval"] = bench_dataset["eval"].map(transform_bench_subject) + else: + raise ValueError( + f"unhandled value `{trainer.args.bench_dataset}` for bench_dataset training args" + ) + bench_dataset = bench_dataset[trainer.args.bench_split] + if trainer.args.max_bench_samples is not None: + bench_dataset = bench_dataset.select(range(trainer.args.max_bench_samples)) + + def tokenize_evals(example): + source = f"{tokenizer.bos_token}{example['input']}" + target = f"{example['output']}{tokenizer.eos_token}" + + tokenized_source = tokenizer( + source, + max_length=2048, + truncation=True, + add_special_tokens=False, + ) + tokenized_target = tokenizer( + target, + max_length=2048, + truncation=True, + add_special_tokens=False, + ) + input_ids = tokenized_source["input_ids"] + tokenized_target["input_ids"] + labels = [IGNORE_INDEX] * len(tokenized_source["input_ids"]) + tokenized_target[ + "input_ids" + ] + + return { + "input_ids": input_ids, + "labels": labels, + "subject": example["subject"], + } + + with zero_first(is_main_process()): + bench_dataset = bench_dataset.map(tokenize_evals) + bench_dataset = bench_dataset.filter(lambda x: x["labels"][-2] in abcd_idx) + + class BenchEvalCallback(TrainerCallback): + """ + TrainerCallback that runs the MMLU evals + """ + + def on_evaluate( + self, + args: AxolotlTrainingArguments, + state: TrainerState, # pylint: disable=unused-argument + control: TrainerControl, # pylint: disable=unused-argument + metrics: Dict[str, float], # pylint: disable=unused-argument + **kwargs, # pylint: disable=unused-argument + ): + data_loader = trainer.get_bench_dataloader( + bench_dataset.remove_columns(["input", "subject", "output", "name"]) + ) + trainer.model.eval() + preds, refs = [], [] + loss_bench = 0 + for batch in tqdm(data_loader, total=len(data_loader)): + (loss, logits, labels) = trainer.prediction_step( + trainer.model, + batch, + prediction_loss_only=False, + ) + # There are two tokens, the output, and eos token. 
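+ # i.e. each eval example keeps exactly two label ids: the answer letter (A-G) and the eos token.
+ # label_non_zero_id is the position of the answer token, so the logits one step earlier
+ # hold the model's prediction for it; restricting them to the answer-letter ids and
+ # taking the argmax gives the predicted choice.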
+ for i, logit in enumerate(logits): + label_non_zero_id = (batch["labels"][i] != IGNORE_INDEX).nonzero()[ + 0 + ][0] + logit_abcd = logit[label_non_zero_id - 1][abcd_idx] + preds.append(torch.argmax(logit_abcd).item()) + labels = labels[labels != IGNORE_INDEX].view(-1, 2)[:, 0] + refs += [ + abcd_idx.index(label) if label in abcd_idx else -1 + for label in labels.tolist() + ] + loss_bench += loss.item() + # Extract results by subject. + bench_name = bench_dataset["name"] + bench_names: dict = {s: {"refs": [], "preds": []} for s in set(bench_name)} + for s, p, r in zip(bench_name, preds, refs): # pylint: disable=invalid-name + bench_names[s]["preds"].append(p) + bench_names[s]["refs"].append(r) + barrier() + local_bench_names = bench_names + gathered_bench_names: List[Dict] = [{} for _ in range(get_world_size())] + # Gather results from all GPUs to GPU 0 + + loss_bench_ranks = gather_scalar_from_all_ranks( + lambda: loss_bench, get_world_size() + ) + len_data_loader_ranks = gather_scalar_from_all_ranks( + lambda: len(data_loader), get_world_size() + ) + + if not is_main_process(): + dist.gather_object(local_bench_names, dst=0) + else: + dist.gather_object(local_bench_names, gathered_bench_names, dst=0) + bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks) + results = {"bench_loss": bench_loss} + + # Combine results from all GPUs + combined_bench_names: Dict[str, Dict[str, List]] = {} + for bench_name in gathered_bench_names: + for name, data in bench_name.items(): + if name not in combined_bench_names: + combined_bench_names[name] = {"refs": [], "preds": []} + combined_bench_names[name]["refs"].extend(data["refs"]) + combined_bench_names[name]["preds"].extend(data["preds"]) + + bench_scores = [] + for ( + bench_name + ) in combined_bench_names: # pylint: disable=consider-using-dict-items + bench_score = accuracy.compute( + references=combined_bench_names[bench_name]["refs"], + predictions=combined_bench_names[bench_name]["preds"], + )["accuracy"] + if not pd.isna(bench_score): + results[ + f"bench_{bench_split}_accuracy_{bench_name}" + ] = bench_score + bench_scores.append(bench_score) + else: + results[f"bench_{bench_split}_accuracy_{bench_name}"] = 0.0 + bench_scores.append(0.0) + results[f"bench_{bench_split}_accuracy"] = np.mean(bench_scores) + trainer.log(results) + + return BenchEvalCallback diff --git a/src/axolotl/utils/distributed.py b/src/axolotl/utils/distributed.py index b3ea07c055..38d0d1e059 100644 --- a/src/axolotl/utils/distributed.py +++ b/src/axolotl/utils/distributed.py @@ -1,8 +1,10 @@ """ utility helpers for distributed checks """ +import os from contextlib import contextmanager +import torch import torch.distributed as dist from accelerate import Accelerator @@ -43,6 +45,10 @@ def is_main_process(): return dist.get_rank() == 0 +def get_world_size(): + return int(os.getenv("WORLD_SIZE", "1")) + + @contextmanager def zero_first(is_main): """ @@ -53,3 +59,35 @@ def zero_first(is_main): yield if is_main: # then rank 0 waits after it has run the context barrier() + + +def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-name + """ + Run a callable 'fn' on all ranks and gather the results on the specified rank. + + Args: + - fn (callable): A function that computes the value. This should not have any side effects. + - rank (int, optional): The rank that gathers the values. Default is 0. + - world_size (int, optional): Total number of processes in the current distributed setup. 
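+ Note: this is a collective call; `fn` runs on every rank and every rank must reach
+ this function, otherwise the gather on the main process will block.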
+ + Returns: + - A list of computed values from all ranks if on the gathering rank, otherwise None. + """ + value_scalar = fn() + value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float() + + if not is_main_process(): + dist.gather(value_tensor, dst=0) + else: + gathered_tensors = [torch.zeros_like(value_tensor) for _ in range(world_size)] + dist.gather(value_tensor, gather_list=gathered_tensors, dst=0) + + # Convert tensors back to their original type (int or float) + gathered_values = [] + for tensor in gathered_tensors: + if tensor == tensor.int(): + gathered_values.append(int(tensor.item())) + else: + gathered_values.append(float(tensor.item())) + return gathered_values + return None diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index fcbdd6d3e2..37578908e4 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -12,9 +12,15 @@ import numpy as np import torch.cuda +import transformers from datasets import Dataset, set_caching_enabled from torch.optim.lr_scheduler import OneCycleLR -from torch.utils.data import DataLoader, DistributedSampler, RandomSampler +from torch.utils.data import ( + DataLoader, + DistributedSampler, + RandomSampler, + SequentialSampler, +) from transformers import EarlyStoppingCallback, Trainer, TrainingArguments from transformers.trainer_pt_utils import SequentialDistributedSampler @@ -23,6 +29,7 @@ GPUStatsCallback, SaveBetterTransformerModelCallback, SavePeftModelCallback, + bench_eval_callback_factory, ) from axolotl.utils.collators import DataCollatorForSeq2Seq from axolotl.utils.dataloader import MultipackDistributedDataloader @@ -127,6 +134,27 @@ class AxolotlTrainingArguments(TrainingArguments): default=None, metadata={"help": "how many warmup steps to take after reset for ReLoRA"}, ) + bench_split: Optional[str] = field( + default="eval", metadata={"help": "The benchmark split to run on"} + ) + bench_dataset: Optional[str] = field( + default="pharaouk/dharma-1/dharma_1_mini.json", + metadata={ + "help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file" + }, + ) + do_bench_eval: Optional[bool] = field( + default=False, metadata={"help": "Whether to run the Benchmark evaluation."} + ) + max_bench_samples: Optional[int] = field( + default=None, + metadata={ + "help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset." 
+ }, + ) + bench_source_max_len: int = field( + default=2048, metadata={"help": "Maximum source sequence length for bench."} + ) class AxolotlTrainer(Trainer): @@ -136,6 +164,10 @@ class AxolotlTrainer(Trainer): args = None # type: AxolotlTrainingArguments + def __init__(self, *args, bench_data_collator=None, **kwargs): + self.bench_data_collator = bench_data_collator + super().__init__(*args, **kwargs) + def create_scheduler( self, num_training_steps: int, optimizer: torch.optim.Optimizer = None ): @@ -226,6 +258,31 @@ def get_eval_dataloader( ) return super().get_eval_dataloader(eval_dataset) + def _get_bench_sampler( + self, bench_dataset: Dataset + ) -> Optional[torch.utils.data.Sampler]: + if self.args.world_size <= 1: + return SequentialSampler(bench_dataset) + return None + + def get_bench_dataloader( + self, + bench_dataset: Dataset, + ) -> Union[DataLoader, MultipackDistributedDataloader]: + dataloader_params = { + "batch_size": self.args.eval_batch_size, + "collate_fn": self.bench_data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + } + + if not isinstance(bench_dataset, torch.utils.data.IterableDataset): + dataloader_params["sampler"] = self._get_bench_sampler(bench_dataset) + dataloader_params["drop_last"] = self.args.dataloader_drop_last + + return DataLoader(bench_dataset, **dataloader_params) + # return self.accelerator.prepare(DataLoader(bench_dataset, **dataloader_params)) + def compute_loss(self, model, inputs, return_outputs=False): # use one's weighted cross entropy loss calc # if self.args.sample_packing: @@ -517,6 +574,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ "steps" if cfg.save_steps else "epoch" ) + if cfg.do_bench_eval: + training_arguments_kwargs["do_bench_eval"] = cfg.do_bench_eval + if cfg.bench_dataset: + training_arguments_kwargs["bench_dataset"] = cfg.bench_dataset + training_args = AxolotlTrainingArguments( # pylint: disable=unexpected-keyword-arg max_steps=total_num_steps if cfg.max_steps else -1, max_seq_length=cfg.sequence_len, @@ -629,8 +691,16 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ return_tensors="pt", **data_collator_kwargs, ), + bench_data_collator=transformers.DataCollatorForSeq2Seq( + tokenizer, + return_tensors="pt", + **data_collator_kwargs, + ), callbacks=callbacks, **trainer_kwargs, ) + if cfg.do_bench_eval: + trainer.add_callback(bench_eval_callback_factory(trainer, tokenizer)) + return trainer From 1e07c162f1df3b912991f5cc7fe16e4e524e2b6d Mon Sep 17 00:00:00 2001 From: Aman Gupta Karmani Date: Wed, 30 Aug 2023 08:10:33 -0400 Subject: [PATCH 35/67] set zero3 optimizer betas to auto so they inherit from HF trainer config (#507) --- deepspeed/zero3.json | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/deepspeed/zero3.json b/deepspeed/zero3.json index 69ad19259f..302f24c1a6 100644 --- a/deepspeed/zero3.json +++ b/deepspeed/zero3.json @@ -35,10 +35,7 @@ "type": "AdamW", "params": { "lr": "auto", - "betas": [ - 0.9, - 0.95 - ], + "betas": "auto", "eps": 1e-8, "weight_decay": "auto" } From c56b450cf501c2d985a3ecd9b5814baa4f456423 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 30 Aug 2023 06:55:26 -0700 Subject: [PATCH 36/67] drop empty tokenized rows too (#509) --- src/axolotl/utils/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 37578908e4..0aceee5190 100644 --- 
a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -361,7 +361,7 @@ def add_position_ids(sample): def drop_long_seq(sample, sequence_len=2048): - return len(sample["input_ids"]) <= sequence_len + return len(sample["input_ids"]) <= sequence_len and len(sample["input_ids"]) > 0 @contextmanager From 42f9642792abd53eaa920ea210509d08941a75e3 Mon Sep 17 00:00:00 2001 From: Alpay Ariyak <98838263+alpayariyak@users.noreply.github.com> Date: Thu, 31 Aug 2023 01:00:50 -0400 Subject: [PATCH 37/67] Changed Bench Eval to report metrics correctly by split. Added total accuracy and renamed previously used bench_accuracy to bench_average_accuracy. (#512) * Added "eval_" prefix * Added total bench accuracy and renamed the previous one to bench_average_accuracy. Changed naming to use bench_split instead of always using eval_ prefix. --- src/axolotl/utils/callbacks.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/axolotl/utils/callbacks.py b/src/axolotl/utils/callbacks.py index 92333f4cab..ee5acfd555 100644 --- a/src/axolotl/utils/callbacks.py +++ b/src/axolotl/utils/callbacks.py @@ -275,7 +275,7 @@ def on_evaluate( else: dist.gather_object(local_bench_names, gathered_bench_names, dst=0) bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks) - results = {"bench_loss": bench_loss} + results = {f"{bench_split}_bench_loss": bench_loss} # Combine results from all GPUs combined_bench_names: Dict[str, Dict[str, List]] = {} @@ -287,6 +287,8 @@ def on_evaluate( combined_bench_names[name]["preds"].extend(data["preds"]) bench_scores = [] + bench_refs = [] + bench_preds = [] for ( bench_name ) in combined_bench_names: # pylint: disable=consider-using-dict-items @@ -294,15 +296,20 @@ def on_evaluate( references=combined_bench_names[bench_name]["refs"], predictions=combined_bench_names[bench_name]["preds"], )["accuracy"] + bench_refs.extend(combined_bench_names[bench_name]["refs"]) + bench_preds.extend(combined_bench_names[bench_name]["preds"]) if not pd.isna(bench_score): results[ - f"bench_{bench_split}_accuracy_{bench_name}" + f"{bench_split}_bench_accuracy_{bench_name}" ] = bench_score bench_scores.append(bench_score) else: - results[f"bench_{bench_split}_accuracy_{bench_name}"] = 0.0 + results[f"{bench_split}_bench_accuracy_{bench_name}"] = 0.0 bench_scores.append(0.0) - results[f"bench_{bench_split}_accuracy"] = np.mean(bench_scores) + results[f"{bench_split}_bench_average_accuracy"] = np.mean(bench_scores) + results[f"{bench_split}_bench_total_accuracy"] = accuracy.compute( + references=bench_refs, predictions=bench_preds + )["accuracy"] trainer.log(results) return BenchEvalCallback From b21e4a20fe532ae5b36cee8eadfcd1ab64cd7639 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 30 Aug 2023 22:01:47 -0700 Subject: [PATCH 38/67] split train from other cli options (#503) --- scripts/finetune.py | 241 +++++++++++---------------------- src/axolotl/common/__init__.py | 0 src/axolotl/common/cli.py | 41 ++++++ src/axolotl/train.py | 139 +++++++++++++++++++ 4 files changed, 261 insertions(+), 160 deletions(-) create mode 100644 src/axolotl/common/__init__.py create mode 100644 src/axolotl/common/cli.py create mode 100644 src/axolotl/train.py diff --git a/scripts/finetune.py b/scripts/finetune.py index 8019af8e3f..201a47e14a 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -4,9 +4,7 @@ import logging import os import random -import signal import sys -from dataclasses import dataclass, field from pathlib import Path from typing import Any, 
Dict, List, Optional, Union @@ -17,17 +15,17 @@ # add src to the pythonpath so we don't need to pip install this from art import text2art -from optimum.bettertransformer import BetterTransformer from transformers import GenerationConfig, TextStreamer +from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer from axolotl.logging_config import configure_logging +from axolotl.train import TrainDatasetMeta, train from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.data import prepare_dataset from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import is_main_process -from axolotl.utils.models import load_model, load_model_config, load_tokenizer +from axolotl.utils.models import load_model_config, load_tokenizer from axolotl.utils.tokenization import check_dataset_labels -from axolotl.utils.trainer import setup_trainer from axolotl.utils.wandb import setup_wandb_env_vars project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) @@ -40,26 +38,13 @@ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" -@dataclass -class TrainerCliArgs: - """ - dataclass representing the various non-training arguments - """ - - debug: bool = field(default=False) - inference: bool = field(default=False) - merge_lora: bool = field(default=False) - prepare_ds_only: bool = field(default=False) - prompter: Optional[str] = field(default=None) - shard: bool = field(default=False) - - def print_axolotl_text_art(suffix=None): font = "nancyj" ascii_text = " axolotl" if suffix: ascii_text += f" x {suffix}" ascii_art = text2art(" axolotl", font=font) + if is_main_process(): print(ascii_art) @@ -73,9 +58,45 @@ def get_multi_line_input() -> Optional[str]: return instruction -def do_inference(cfg, model, tokenizer, prompter: Optional[str]): - if prompter == "None": - prompter = None +def do_merge_lora( + *, + cfg: DictDefault, + cli_args: TrainerCliArgs, +): + model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args) + safe_serialization = cfg.save_safetensors is True + + LOG.info("running merge of LoRA with base model") + model = model.merge_and_unload() + model.to(dtype=torch.float16) + + if cfg.local_rank == 0: + LOG.info("saving merged model") + model.save_pretrained( + str(Path(cfg.output_dir) / "merged"), + safe_serialization=safe_serialization, + ) + tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged")) + + +def shard( + *, + cfg: DictDefault, + cli_args: TrainerCliArgs, +): + model, _ = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args) + safe_serialization = cfg.save_safetensors is True + LOG.debug("Re-saving model w/ sharding") + model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization) + + +def do_inference( + *, + cfg: DictDefault, + cli_args: TrainerCliArgs, +): + model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args) + prompter = cli_args.prompter default_tokens = {"unk_token": "", "bos_token": "", "eos_token": ""} for token, symbol in default_tokens.items(): @@ -176,141 +197,6 @@ def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> b return not any(el in list2 for el in list1) -def train( - *, - cfg: DictDefault, - cli_args: TrainerCliArgs, -): - # load the tokenizer first - LOG.info(f"loading tokenizer... 
{cfg.tokenizer_config or cfg.base_model_config}") - tokenizer = load_tokenizer(cfg) - - if not ( - cli_args.shard or cli_args.merge_lora or cli_args.inference - ): # don't need to load dataset for these - train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer) - - if cli_args.debug or cfg.debug: - LOG.info("check_dataset_labels...") - check_dataset_labels( - train_dataset.select( - [random.randrange(0, len(train_dataset) - 1) for _ in range(5)] # nosec - ), - tokenizer, - ) - - if cli_args.prepare_ds_only: - LOG.info("Finished preparing dataset. Exiting...") - return - - # Load the model and tokenizer - LOG.info("loading model and (optionally) peft_config...") - model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference) - - safe_serialization = cfg.save_safetensors is True - - if cli_args.merge_lora and cfg.adapter is not None: - LOG.info("running merge of LoRA with base model") - model = model.merge_and_unload() - model.to(dtype=torch.float16) - - if cfg.local_rank == 0: - LOG.info("saving merged model") - model.save_pretrained( - str(Path(cfg.output_dir) / "merged"), - safe_serialization=safe_serialization, - ) - tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged")) - return - - if cli_args.inference: - LOG.debug("Running inference on model") - do_inference(cfg, model, tokenizer, prompter=cli_args.prompter) - return - - if cli_args.shard: - LOG.debug("Re-saving model w/ sharding") - model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization) - return - - if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints: - possible_checkpoints = [ - str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*") - ] - if len(possible_checkpoints) > 0: - sorted_paths = sorted( - possible_checkpoints, - key=lambda path: int(path.split("-")[-1]), - ) - cfg.resume_from_checkpoint = sorted_paths[-1] - LOG.info( - f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}" - ) - resume_from_checkpoint = cfg.resume_from_checkpoint - - trainer = setup_trainer( - cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps - ) - - model.config.use_cache = False - - if torch.__version__ >= "2" and sys.platform != "win32": - LOG.info("Compiling torch model") - model = torch.compile(model) - - # go ahead and presave, so we have the adapter config available to inspect - if peft_config: - LOG.info(f"Pre-saving adapter config to {cfg.output_dir}") - peft_config.save_pretrained(cfg.output_dir) - - # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model - if cfg.local_rank == 0: - - def terminate_handler(_, __, model): - if cfg.flash_optimum: - model = BetterTransformer.reverse(model) - model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization) - sys.exit(0) - - signal.signal( - signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model) - ) - - LOG.info("Starting trainer...") - if cfg.group_by_length: - LOG.info("hang tight... sorting dataset for group_by_length") - - if not Path(cfg.output_dir).is_dir(): - os.makedirs(cfg.output_dir, exist_ok=True) - tokenizer.save_pretrained(cfg.output_dir) - if cfg.flash_optimum: - with torch.backends.cuda.sdp_kernel( - enable_flash=True, enable_math=True, enable_mem_efficient=True - ): - trainer.train(resume_from_checkpoint=resume_from_checkpoint) - else: - trainer.train(resume_from_checkpoint=resume_from_checkpoint) - - LOG.info(f"Training Completed!!! 
Saving pre-trained model to {cfg.output_dir}") - - if cfg.relora_steps: - if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit): - model = model.merge_and_unload() - else: - # final model weights have already been saved by `ReLoRACallback.on_train_end` - return - - # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading - # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file - if cfg.fsdp: - trainer.save_model(cfg.output_dir) - elif cfg.local_rank == 0: - if cfg.flash_optimum: - model = BetterTransformer.reverse(model) - - model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization) - - def load_cfg(config: Path = Path("examples/"), **kwargs): if Path(config).is_dir(): config = choose_config(config) @@ -347,15 +233,50 @@ def load_cfg(config: Path = Path("examples/"), **kwargs): return cfg -def do_train(config: Path = Path("examples/"), **kwargs): +def load_datasets( + *, + cfg: DictDefault, + cli_args: TrainerCliArgs, +) -> TrainDatasetMeta: + tokenizer = load_tokenizer(cfg) + + train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer) + + if cli_args.debug or cfg.debug: + LOG.info("check_dataset_labels...") + check_dataset_labels( + train_dataset.select( + [random.randrange(0, len(train_dataset) - 1) for _ in range(5)] # nosec + ), + tokenizer, + ) + + return TrainDatasetMeta( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + total_num_steps=total_num_steps, + ) + + +def do_cli(config: Path = Path("examples/"), **kwargs): print_axolotl_text_art() parsed_cfg = load_cfg(config, **kwargs) parser = transformers.HfArgumentParser((TrainerCliArgs)) parsed_cli_args, _ = parser.parse_args_into_dataclasses( return_remaining_strings=True ) - train(cfg=parsed_cfg, cli_args=parsed_cli_args) + if parsed_cli_args.inference: + do_inference(cfg=parsed_cfg, cli_args=parsed_cli_args) + elif parsed_cli_args.merge_lora: + do_merge_lora(cfg=parsed_cfg, cli_args=parsed_cli_args) + elif parsed_cli_args.shard: + shard(cfg=parsed_cfg, cli_args=parsed_cli_args) + else: + dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args) + if parsed_cli_args.prepare_ds_only: + return + train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta) if __name__ == "__main__": - fire.Fire(do_train) + fire.Fire(do_cli) diff --git a/src/axolotl/common/__init__.py b/src/axolotl/common/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/axolotl/common/cli.py b/src/axolotl/common/cli.py new file mode 100644 index 0000000000..f5bd9b0375 --- /dev/null +++ b/src/axolotl/common/cli.py @@ -0,0 +1,41 @@ +""" +shared module for cli specific things +""" + +import logging +from dataclasses import dataclass, field +from typing import Optional + +from axolotl.logging_config import configure_logging +from axolotl.utils.dict import DictDefault +from axolotl.utils.models import load_model, load_tokenizer + +configure_logging() +LOG = logging.getLogger("axolotl.common.cli") + + +@dataclass +class TrainerCliArgs: + """ + dataclass representing the various non-training arguments + """ + + debug: bool = field(default=False) + inference: bool = field(default=False) + merge_lora: bool = field(default=False) + prepare_ds_only: bool = field(default=False) + prompter: Optional[str] = field(default=None) + shard: bool = field(default=False) + + +def load_model_and_tokenizer( + *, + cfg: DictDefault, + cli_args: TrainerCliArgs, 
+): + LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}") + tokenizer = load_tokenizer(cfg) + LOG.info("loading model and (optionally) peft_config...") + model, _ = load_model(cfg, tokenizer, inference=cli_args.inference) + + return model, tokenizer diff --git a/src/axolotl/train.py b/src/axolotl/train.py new file mode 100644 index 0000000000..51ef359037 --- /dev/null +++ b/src/axolotl/train.py @@ -0,0 +1,139 @@ +"""Prepare and train a model on a dataset. Can also infer from a model or merge lora""" + +import logging +import os +import signal +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +import torch + +# add src to the pythonpath so we don't need to pip install this +from datasets import Dataset +from optimum.bettertransformer import BetterTransformer + +from axolotl.common.cli import TrainerCliArgs +from axolotl.logging_config import configure_logging +from axolotl.utils.dict import DictDefault +from axolotl.utils.models import load_model, load_tokenizer +from axolotl.utils.trainer import setup_trainer + +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +src_dir = os.path.join(project_root, "src") +sys.path.insert(0, src_dir) + +configure_logging() +LOG = logging.getLogger("axolotl.train") + + +@dataclass +class TrainDatasetMeta: + """ + dataclass to capture the dataset specific options for training + """ + + train_dataset: Dataset + eval_dataset: Optional[Dataset] = None + total_num_steps: Optional[int] = None + + +def train( + *, + cfg: DictDefault, + cli_args: TrainerCliArgs, + dataset_meta: TrainDatasetMeta, +): + # load the tokenizer first + LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}") + tokenizer = load_tokenizer(cfg) + + train_dataset = dataset_meta.train_dataset + eval_dataset = dataset_meta.eval_dataset + total_num_steps = dataset_meta.total_num_steps + + # Load the model and tokenizer + LOG.info("loading model and (optionally) peft_config...") + model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference) + + safe_serialization = cfg.save_safetensors is True + + if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints: + possible_checkpoints = [ + str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*") + ] + if len(possible_checkpoints) > 0: + sorted_paths = sorted( + possible_checkpoints, + key=lambda path: int(path.split("-")[-1]), + ) + cfg.resume_from_checkpoint = sorted_paths[-1] + LOG.info( + f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}" + ) + resume_from_checkpoint = cfg.resume_from_checkpoint + + trainer = setup_trainer( + cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps + ) + + model.config.use_cache = False + + if torch.__version__ >= "2" and sys.platform != "win32": + LOG.info("Compiling torch model") + model = torch.compile(model) + + # go ahead and presave, so we have the adapter config available to inspect + if peft_config: + LOG.info(f"Pre-saving adapter config to {cfg.output_dir}") + peft_config.save_pretrained(cfg.output_dir) + + # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model + if cfg.local_rank == 0: + + def terminate_handler(_, __, model): + if cfg.flash_optimum: + model = BetterTransformer.reverse(model) + model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization) + sys.exit(0) + + signal.signal( + signal.SIGINT, lambda signum, frame: 
terminate_handler(signum, frame, model) + ) + + LOG.info("Starting trainer...") + if cfg.group_by_length: + LOG.info("hang tight... sorting dataset for group_by_length") + + if not Path(cfg.output_dir).is_dir(): + os.makedirs(cfg.output_dir, exist_ok=True) + tokenizer.save_pretrained(cfg.output_dir) + if cfg.flash_optimum: + with torch.backends.cuda.sdp_kernel( + enable_flash=True, enable_math=True, enable_mem_efficient=True + ): + trainer.train(resume_from_checkpoint=resume_from_checkpoint) + else: + trainer.train(resume_from_checkpoint=resume_from_checkpoint) + + LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}") + + if cfg.relora_steps: + if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit): + model = model.merge_and_unload() + else: + # final model weights have already been saved by `ReLoRACallback.on_train_end` + return model, tokenizer + + # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading + # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file + if cfg.fsdp: + trainer.save_model(cfg.output_dir) + elif cfg.local_rank == 0: + if cfg.flash_optimum: + model = BetterTransformer.reverse(model) + + model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization) + + return model, tokenizer From 396a7a74fc6a5d73c4d2d555810ed516f456c479 Mon Sep 17 00:00:00 2001 From: Jan Philipp Harries <2862336+jphme@users.noreply.github.com> Date: Thu, 31 Aug 2023 19:37:47 +0200 Subject: [PATCH 39/67] Added advanced DDP args (#515) * add ddp_config * add advanced ddp config * add ddp_config * add advanced ddp config --------- Co-authored-by: Jan Philipp Harries --- README.md | 5 +++++ src/axolotl/utils/trainer.py | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/README.md b/README.md index 204e2141a8..19b164a8ba 100644 --- a/README.md +++ b/README.md @@ -623,6 +623,11 @@ fsdp_config: # Deepspeed config path deepspeed: +# Advanced DDP Arguments +ddp_timeout: +ddp_bucket_cap_mb: +ddp_broadcast_buffers: + # Path to torch distx for optim 'adamw_anyprecision' torchdistx_path: diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 0aceee5190..f0669565fb 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -579,6 +579,15 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ if cfg.bench_dataset: training_arguments_kwargs["bench_dataset"] = cfg.bench_dataset + # DDP Config + if cfg.ddp_timeout: + training_arguments_kwargs["ddp_timeout"] = cfg.ddp_timeout + # see https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html + if cfg.ddp_bucket_cap_mb: + training_arguments_kwargs["ddp_bucket_cap_mb"] = cfg.ddp_bucket_cap_mb + if cfg.ddp_broadcast_buffers is not None: + training_arguments_kwargs["ddp_broadcast_buffers"] = cfg.ddp_broadcast_buffers + training_args = AxolotlTrainingArguments( # pylint: disable=unexpected-keyword-arg max_steps=total_num_steps if cfg.max_steps else -1, max_seq_length=cfg.sequence_len, From 48434bec54cb44373cbeafa787f738c48f76cdba Mon Sep 17 00:00:00 2001 From: Tom Jobbins <784313+TheBloke@users.noreply.github.com> Date: Thu, 31 Aug 2023 22:26:52 +0100 Subject: [PATCH 40/67] Debug tokenization output: Add ability to output text only (no tokens), and/or specify num samples to see (#511) --- scripts/finetune.py | 7 ++++++- src/axolotl/common/cli.py | 2 ++ src/axolotl/utils/tokenization.py | 14 
++++++++------ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 201a47e14a..0a5f318639 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -246,9 +246,14 @@ def load_datasets( LOG.info("check_dataset_labels...") check_dataset_labels( train_dataset.select( - [random.randrange(0, len(train_dataset) - 1) for _ in range(5)] # nosec + [ + random.randrange(0, len(train_dataset) - 1) # nosec + for _ in range(cli_args.debug_num_examples) + ] ), tokenizer, + num_examples=cli_args.debug_num_examples, + text_only=cli_args.debug_text_only, ) return TrainDatasetMeta( diff --git a/src/axolotl/common/cli.py b/src/axolotl/common/cli.py index f5bd9b0375..62f2b1061a 100644 --- a/src/axolotl/common/cli.py +++ b/src/axolotl/common/cli.py @@ -21,6 +21,8 @@ class TrainerCliArgs: """ debug: bool = field(default=False) + debug_text_only: bool = field(default=False) + debug_num_examples: int = field(default=5) inference: bool = field(default=False) merge_lora: bool = field(default=False) prepare_ds_only: bool = field(default=False) diff --git a/src/axolotl/utils/tokenization.py b/src/axolotl/utils/tokenization.py index b2d1df4007..82fcbc638e 100644 --- a/src/axolotl/utils/tokenization.py +++ b/src/axolotl/utils/tokenization.py @@ -8,13 +8,13 @@ LOG = logging.getLogger("axolotl") -def check_dataset_labels(dataset, tokenizer): +def check_dataset_labels(dataset, tokenizer, num_examples=5, text_only=False): # the dataset is already shuffled, so let's just check the first 5 elements - for idx in range(5): - check_example_labels(dataset[idx], tokenizer) + for idx in range(num_examples): + check_example_labels(dataset[idx], tokenizer, text_only=text_only) -def check_example_labels(example, tokenizer): +def check_example_labels(example, tokenizer, text_only=False): # Get the input_ids, labels, and attention_mask from the dataset input_ids = example["input_ids"] labels = example["labels"] @@ -29,8 +29,10 @@ def check_example_labels(example, tokenizer): decoded_input_token = tokenizer.decode(input_id) # Choose the color based on whether the label has the ignore value or not color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green") - colored_token = colored(decoded_input_token, color) + colored( - f"({label_id}, {mask}, {input_id})", "white" + colored_token = colored(decoded_input_token, color) + ( + not text_only + and colored(f"({label_id}, {mask}, {input_id})", "white") + or "" ) colored_tokens.append(colored_token) From 7710e81f502e47a453651289465bdb49f8989c89 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 31 Aug 2023 15:45:23 -0700 Subject: [PATCH 41/67] log supervised token count (#448) --- src/axolotl/utils/trainer.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index f0669565fb..f91f4e318e 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -401,6 +401,16 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer): LOG.info(f"📝 UPDATE CONFIG WITH: `total_num_tokens: {total_num_tokens}`") cfg.total_num_tokens = total_num_tokens + if not cfg.total_supervised_tokens: + total_supervised_tokens = ( + train_dataset.data.column("labels") + .to_pandas() + .apply(lambda x: np.sum(np.array(x) != -100)) + .sum() + ) + LOG.info(f"`total_supervised_tokens: {total_supervised_tokens}`") + cfg.total_supervised_tokens = total_supervised_tokens + if cfg.sample_packing_eff_est: total_num_steps = ( # match count to len est in 
dataloader From f51c9c56c6c319e331c702949f827155ddc5e45b Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 1 Sep 2023 16:08:21 +0900 Subject: [PATCH 42/67] Fix(doc): Inform Windows users to use WSL/docker (#518) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 19b164a8ba..30f7b4844f 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,8 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \ ``` +- Windows: Please use WSL or Docker! + ### Dataset Axolotl supports a variety of dataset formats. Below are some of the formats you can use. From 1991946c5a5b57bc89aaec8167066b334543aba6 Mon Sep 17 00:00:00 2001 From: Maxime <672982+maximegmd@users.noreply.github.com> Date: Fri, 1 Sep 2023 16:11:45 +0200 Subject: [PATCH 43/67] fix: bad dtype for full finetune (#504) * fix: bad dtype for full finetune * Update src/axolotl/utils/models.py Co-authored-by: Wing Lian * Update models.py --------- Co-authored-by: Wing Lian --- src/axolotl/utils/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 4b9c79d848..9f0795af76 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -371,7 +371,7 @@ def load_model( # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to # convert them back to fp16/bf16 for flash-attn compatibility. - if needs_fa2_dtype and (cfg.flash_attention and cfg.is_llama_derived_model): + if needs_fa2_dtype or (cfg.flash_attention and cfg.is_llama_derived_model): LOG.info("converting modules to %s for flash attention", cfg.torch_dtype) for name, module in model.named_modules(): if "norm" in name: From 995557bdf3c6c8b3e839b224ef9513fc2b097f30 Mon Sep 17 00:00:00 2001 From: kingbri Date: Fri, 1 Sep 2023 13:48:33 -0400 Subject: [PATCH 44/67] Prompters: ShareGPT: Allow for custom system prompts If a system prompt is present in a conversation, add it instead of using the default. 
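For illustration, a minimal sketch of the prompt-selection behavior described above, assuming ShareGPT-style conversations are lists of `{"from", "value"}` dicts; the `DEFAULT_SYSTEM_PROMPT` constant and `resolve_system_prompt` helper are hypothetical names used only for this sketch, not part of the patch itself:

```python
from typing import Dict, List, Tuple

# Hypothetical default, standing in for whatever prompt the prompter would otherwise use.
DEFAULT_SYSTEM_PROMPT = "A chat between a curious user and an artificial intelligence assistant."


def resolve_system_prompt(
    source: List[Dict[str, str]]
) -> Tuple[str, List[Dict[str, str]]]:
    """Pick the system prompt for a ShareGPT-style conversation.

    If the first turn is a system message, use its value instead of the default
    and drop it from the turns that get rendered as human/gpt exchanges.
    """
    if source and source[0].get("from") == "system":
        return source[0]["value"], source[1:]
    return DEFAULT_SYSTEM_PROMPT, source


# usage sketch
conversation = [
    {"from": "system", "value": "You are a terse assistant."},
    {"from": "human", "value": "Hi!"},
    {"from": "gpt", "value": "Hello."},
]
system_prompt, turns = resolve_system_prompt(conversation)
assert system_prompt == "You are a terse assistant."
assert turns[0]["from"] == "human"
```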
Signed-off-by: kingbri --- src/axolotl/prompters.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py index f1fe7d4568..5322a10182 100644 --- a/src/axolotl/prompters.py +++ b/src/axolotl/prompters.py @@ -309,10 +309,6 @@ def __init__(self, prompt_style=None, system_prompt: Optional[str] = None): ) def build_prompt(self, source) -> Generator[str, None, None]: - # ignore the system prompt if provided - if source[0]["from"] == "system": - source.pop(0) - if len(source) < 2: # If there isn't a back and forth conversation, ignore it # also happens on the data splitting leaving empty conversations @@ -321,6 +317,12 @@ def build_prompt(self, source) -> Generator[str, None, None]: ) conv = self._conversation.copy() + + # Add the conversation system prompt if provided, otherwise use the default one + if source[0]["from"] == "system": + conv.system = source[0]["value"] + source.pop(0) + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} try: From 09f154397eeed6fd86d887c2b9bdd0f49c885630 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 3 Sep 2023 23:24:28 -0400 Subject: [PATCH 45/67] No gather single gpu (#523) * don't attempt to gather on multi-gpu * also check distributed status in bench callback --- src/axolotl/utils/callbacks.py | 8 ++++++-- src/axolotl/utils/distributed.py | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/axolotl/utils/callbacks.py b/src/axolotl/utils/callbacks.py index ee5acfd555..8fc5a918b3 100644 --- a/src/axolotl/utils/callbacks.py +++ b/src/axolotl/utils/callbacks.py @@ -27,6 +27,7 @@ barrier, gather_scalar_from_all_ranks, get_world_size, + is_distributed, is_main_process, zero_first, ) @@ -270,10 +271,13 @@ def on_evaluate( lambda: len(data_loader), get_world_size() ) - if not is_main_process(): + if is_distributed() and not is_main_process(): dist.gather_object(local_bench_names, dst=0) else: - dist.gather_object(local_bench_names, gathered_bench_names, dst=0) + if is_distributed(): + dist.gather_object(local_bench_names, gathered_bench_names, dst=0) + else: + gathered_bench_names = [local_bench_names] bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks) results = {f"{bench_split}_bench_loss": bench_loss} diff --git a/src/axolotl/utils/distributed.py b/src/axolotl/utils/distributed.py index 38d0d1e059..5e527f3b95 100644 --- a/src/axolotl/utils/distributed.py +++ b/src/axolotl/utils/distributed.py @@ -74,6 +74,8 @@ def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-n - A list of computed values from all ranks if on the gathering rank, otherwise None. 
""" value_scalar = fn() + if not is_distributed(): + return [value_scalar] value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float() if not is_main_process(): From 44454ae4c487f4b51140a246340ea9c7b6180242 Mon Sep 17 00:00:00 2001 From: Aman Gupta Karmani Date: Mon, 4 Sep 2023 00:19:03 -0400 Subject: [PATCH 46/67] move is_llama_derived_model into normalize_config (#524) --- scripts/finetune.py | 11 +---------- src/axolotl/utils/config.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 0a5f318639..b998edc798 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -24,7 +24,7 @@ from axolotl.utils.data import prepare_dataset from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import is_main_process -from axolotl.utils.models import load_model_config, load_tokenizer +from axolotl.utils.models import load_tokenizer from axolotl.utils.tokenization import check_dataset_labels from axolotl.utils.wandb import setup_wandb_env_vars @@ -216,15 +216,6 @@ def load_cfg(config: Path = Path("examples/"), **kwargs): else: cfg[k] = kwargs[k] - model_config = load_model_config(cfg) - - # figure out if the model is llama - cfg.is_llama_derived_model = ( - (hasattr(model_config, "model_type") and model_config.model_type == "llama") - or cfg.is_llama_derived_model - or "llama" in cfg.base_model - or (cfg.model_type and "llama" in cfg.model_type.lower()) - ) validate_config(cfg) normalize_config(cfg) diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py index abb3154d21..93a23f7738 100644 --- a/src/axolotl/utils/config.py +++ b/src/axolotl/utils/config.py @@ -6,6 +6,7 @@ import torch from axolotl.utils.bench import log_gpu_memory_usage +from axolotl.utils.models import load_model_config LOG = logging.getLogger("axolotl") @@ -69,6 +70,16 @@ def normalize_config(cfg): else: cfg.torch_dtype = torch.float32 + model_config = load_model_config(cfg) + + # figure out if the model is llama + cfg.is_llama_derived_model = ( + (hasattr(model_config, "model_type") and model_config.model_type == "llama") + or cfg.is_llama_derived_model + or "llama" in cfg.base_model + or (cfg.model_type and "llama" in cfg.model_type.lower()) + ) + log_gpu_memory_usage(LOG, "baseline", cfg.device) From 5fe30b1497f6ebe75b46ee32f3b4c99b4cc9d672 Mon Sep 17 00:00:00 2001 From: Aman Gupta Karmani Date: Mon, 4 Sep 2023 17:49:16 -0400 Subject: [PATCH 47/67] use flash_attn xentropy when available (#525) * use flash_attn xentropy when available * log when xentropy is not found --- .../monkeypatch/llama_attn_hijack_flash.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py index cb0aa3fe6f..b0163a6556 100644 --- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py +++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py @@ -2,7 +2,9 @@ # copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py +import logging import warnings +from functools import partial from typing import List, Optional, Tuple, Union import torch @@ -33,6 +35,9 @@ ) +LOG = logging.getLogger("axolotl") + + def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False): transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( # pylint: disable=protected-access _prepare_decoder_attention_mask @@ -44,6 +49,18 @@ def 
replace_llama_attn_with_flash_attn(packed: Optional[bool] = False): llama_model_forward ) + try: + from flash_attn.losses.cross_entropy import CrossEntropyLoss + + LOG.info("patching with flash_attn.losses.cross_entropy") + transformers.models.llama.modeling_llama.CrossEntropyLoss = partial( + CrossEntropyLoss, inplace_backward=True + ) + except ImportError: + LOG.info( + "optimized flash-attention CrossEntropyLoss not found (run `pip install git+https://github.com/Dao-AILab/flash-attention.git#egg=xentropy_cuda_lib&subdirectory=csrc/xentropy`)" + ) + # Disable the transformation of the attention mask in LlamaModel as the flash attention # requires the attention mask to be the same as the key_padding_mask From 72a6fe1c1f1916651cc34b80cf0a94984775b442 Mon Sep 17 00:00:00 2001 From: Aman Gupta Karmani Date: Mon, 4 Sep 2023 19:44:51 -0400 Subject: [PATCH 48/67] use flash_attn rmsnorm when available (#526) * use flash_attn xentropy when available * use flash_attn.ops.rms_norm when available * log when xentropy is not found * log how to install RMSNorm * add quotes so pip install works --- .../monkeypatch/llama_attn_hijack_flash.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py index b0163a6556..39cfb5c173 100644 --- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py +++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py @@ -58,7 +58,24 @@ def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False): ) except ImportError: LOG.info( - "optimized flash-attention CrossEntropyLoss not found (run `pip install git+https://github.com/Dao-AILab/flash-attention.git#egg=xentropy_cuda_lib&subdirectory=csrc/xentropy`)" + "optimized flash-attention CrossEntropyLoss not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=xentropy_cuda_lib&subdirectory=csrc/xentropy'`)" + ) + + try: + from flash_attn.ops.rms_norm import RMSNorm + + LOG.info("patching with flash_attn.ops.rms_norm") + + class LlamaRMSNorm(RMSNorm): + """Patched LLamaRMSNorm""" + + def __init__(self, hidden_size, eps=1e-6): + super().__init__(hidden_size, eps=eps) + + transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm + except ImportError: + LOG.info( + "optimized flash-attention RMSNorm not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=dropout_layer_norm&subdirectory=csrc/layer_norm'`)" ) From fc8766e502dee8d26c0eef835818d8a390ebf574 Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Tue, 5 Sep 2023 02:21:24 +0000 Subject: [PATCH 49/67] reorg a bit --- src/axolotl/monkeypatch/llama_attn_hijack_flash.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py index 39cfb5c173..ef048082c1 100644 --- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py +++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py @@ -64,14 +64,13 @@ def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False): try: from flash_attn.ops.rms_norm import RMSNorm - LOG.info("patching with flash_attn.ops.rms_norm") - class LlamaRMSNorm(RMSNorm): """Patched LLamaRMSNorm""" def __init__(self, hidden_size, eps=1e-6): super().__init__(hidden_size, eps=eps) + LOG.info("patching with flash_attn.ops.rms_norm") transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm except ImportError: LOG.info( From 
3355706e22d95ddca7dc242fe429f06d4ce89526 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 5 Sep 2023 12:43:22 -0400 Subject: [PATCH 50/67] Add support for GPTQ using native transformers/peft (#468) * auto gptq support * more tweaks and add yml * remove old gptq docker * don't need explicit peft install for tests * fix setup.py to use extra index url install torch for tests fix cuda version for autogptq index set torch in requirements so that it installs properly move gptq install around to work with github cicd * gptq doesn't play well with sample packing * address pr feedback * remove torch install for now * set quantization_config from model config * Fix the implementation for getting quant config from model config --- .github/workflows/main.yml | 10 --- .github/workflows/tests.yml | 2 +- docker/Dockerfile | 5 +- examples/gptq-lora-7b/README.md | 8 -- examples/gptq-lora-7b/config.yml | 63 ---------------- examples/llama-2/gptq-lora.yml | 76 +++++++++++++++++++ requirements.txt | 4 + setup.py | 39 ++++++---- src/axolotl/utils/config.py | 4 +- src/axolotl/utils/models.py | 123 +++++++++---------------------- src/axolotl/utils/trainer.py | 18 +---- 11 files changed, 142 insertions(+), 210 deletions(-) delete mode 100644 examples/gptq-lora-7b/README.md delete mode 100644 examples/gptq-lora-7b/config.yml create mode 100644 examples/llama-2/gptq-lora.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d20db7065a..30d4774dbf 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -23,11 +23,6 @@ jobs: python_version: "3.10" pytorch: 2.0.1 axolotl_extras: - - cuda: 118 - cuda_version: 11.8.0 - python_version: "3.9" - pytorch: 2.0.1 - axolotl_extras: gptq runs-on: self-hosted steps: - name: Checkout @@ -73,11 +68,6 @@ jobs: pytorch: 2.0.1 axolotl_extras: is_latest: true - - cuda: 118 - cuda_version: 11.8.0 - python_version: "3.9" - pytorch: 2.0.1 - axolotl_extras: gptq runs-on: self-hosted steps: - name: Checkout diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 01703cd515..d5184def60 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -24,7 +24,7 @@ jobs: - name: Install dependencies run: | - pip install -e .[peft] + pip install -e . pip install -r requirements-tests.txt - name: Run tests diff --git a/docker/Dockerfile b/docker/Dockerfile index b429d50f27..683ca75ffd 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -11,14 +11,13 @@ RUN apt-get update && \ WORKDIR /workspace -RUN pip3 install "peft @ git+https://github.com/huggingface/peft.git@main" RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git # If AXOLOTL_EXTRAS is set, append it in brackets RUN cd axolotl && \ if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ - pip install -e .[flash-attn,$AXOLOTL_EXTRAS]; \ + pip install -e .[flash-attn,gptq,$AXOLOTL_EXTRAS]; \ else \ - pip install -e .[flash-attn]; \ + pip install -e .[flash-attn,gptq]; \ fi # fix so that git fetch/pull from remote works diff --git a/examples/gptq-lora-7b/README.md b/examples/gptq-lora-7b/README.md deleted file mode 100644 index 0bde51b068..0000000000 --- a/examples/gptq-lora-7b/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# LLaMa 7B using LoRA - -This is a good place to start for beginners. This will run on an NVIDIA RTX4090 with no other changes needed. 
- -```shell -accelerate launch scripts/finetune.py examples/gptq-lora-7b/config.yml - -``` diff --git a/examples/gptq-lora-7b/config.yml b/examples/gptq-lora-7b/config.yml deleted file mode 100644 index d909f7d079..0000000000 --- a/examples/gptq-lora-7b/config.yml +++ /dev/null @@ -1,63 +0,0 @@ -base_model: Neko-Institute-of-Science/LLaMA-7B-4bit-128g -base_model_config: Neko-Institute-of-Science/LLaMA-7B-4bit-128g -model_type: LlamaForCausalLM -tokenizer_type: LlamaTokenizer -trust_remote_code: -load_in_8bit: true -gptq: true -datasets: - - path: vicgalle/alpaca-gpt4 - type: alpaca -dataset_prepared_path: last_run_prepared -val_set_size: 0.02 -adapter: -lora_model_dir: -sequence_len: 2048 -max_packed_sequence_len: -lora_r: 8 -lora_alpha: 16 -lora_dropout: 0.05 -lora_target_modules: - - q_proj - - v_proj -lora_fan_in_fan_out: false -wandb_project: llama-7b-lora-int4 -wandb_entity: -wandb_watch: -wandb_run_id: -wandb_log_model: -output_dir: ./llama-7b-lora-int4 -gradient_accumulation_steps: 1 -micro_batch_size: 1 -num_epochs: 3 -optimizer: adamw_bnb_8bit -torchdistx_path: -lr_scheduler: cosine -learning_rate: 0.0000002 -train_on_inputs: false -group_by_length: false -fp16: true -bf16: false -tf32: true -early_stopping_patience: -resume_from_checkpoint: -local_rank: -logging_steps: 5 -xformers_attention: -flash_attention: -gradient_checkpointing: true -gptq_groupsize: 128 -gptq_model_v1: false -warmup_steps: 20 -eval_steps: 110 -save_steps: 660 -debug: -deepspeed: -weight_decay: 0.0001 -fsdp: -fsdp_config: -tokens: - pad_token: "" - bos_token: "" - eos_token: "" - unk_token: "" diff --git a/examples/llama-2/gptq-lora.yml b/examples/llama-2/gptq-lora.yml new file mode 100644 index 0000000000..dbce2a6b34 --- /dev/null +++ b/examples/llama-2/gptq-lora.yml @@ -0,0 +1,76 @@ +base_model: TheBloke/Llama-2-7B-GPTQ +base_model_config: TheBloke/Llama-2-7B-GPTQ +is_llama_derived_model: false +gptq: true +gptq_bits: 4 +model_type: AutoModelForCausalLM +tokenizer_type: LlamaTokenizer +tokenizer_use_fast: true +tokenizer_legacy: true +load_in_8bit: false +load_in_4bit: false +strict: false +push_dataset_to_hub: +hf_use_auth_token: true +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +adapter: lora +lora_model_dir: +sequence_len: 4096 +sample_packing: +lora_r: 8 +lora_alpha: 32 +lora_dropout: 0.05 +lora_target_modules: + - k_proj + - o_proj + - q_proj + - v_proj +lora_target_linear: +lora_fan_in_fan_out: +wandb_project: +wandb_watch: +wandb_run_id: +wandb_log_model: +output_dir: ./model-out +gradient_accumulation_steps: 1 +micro_batch_size: 1 +num_epochs: 3 +optimizer: adamw_torch +adam_beta2: 0.95 +adam_eps: 0.00001 +max_grad_norm: 1.0 +torchdistx_path: +lr_scheduler: cosine +lr_quadratic_warmup: true +learning_rate: 0.000017 +train_on_inputs: false +group_by_length: false +bf16: false +fp16: false +float16: true +tf32: true +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: +sdp_attention: +flash_optimum: +gptq_groupsize: +gptq_model_v1: +warmup_steps: 100 +eval_steps: +save_steps: +debug: +deepspeed: +weight_decay: 0.1 +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/requirements.txt b/requirements.txt index fcd7f9292a..1c8e97dffc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,7 @@ +--extra-index-url https://download.pytorch.org/whl/cu118 +--extra-index-url 
https://huggingface.github.io/autogptq-index/whl/cu118/ +torch==2.0.1 +auto-gptq packaging peft @ git+https://github.com/huggingface/peft.git transformers @ git+https://github.com/huggingface/transformers.git diff --git a/setup.py b/setup.py index 7b99794dee..973d656cd1 100644 --- a/setup.py +++ b/setup.py @@ -2,15 +2,27 @@ from setuptools import find_packages, setup -install_requires = [] -with open("./requirements.txt", encoding="utf-8") as requirements_file: - # don't include peft yet until we check the int4 - # need to manually install peft for now... - reqs = [r.strip() for r in requirements_file.readlines() if "peft" not in r] - reqs = [r for r in reqs if "flash-attn" not in r] - reqs = [r for r in reqs if r and r[0] != "#"] - for r in reqs: - install_requires.append(r) + +def parse_requirements(): + _install_requires = [] + _dependency_links = [] + with open("./requirements.txt", encoding="utf-8") as requirements_file: + lines = [ + r.strip() for r in requirements_file.readlines() if "auto-gptq" not in r + ] + for line in lines: + if line.startswith("--extra-index-url"): + # Handle custom index URLs + _, url = line.split() + _dependency_links.append(url) + elif "flash-attn" not in line and line and line[0] != "#": + # Handle standard packages + _install_requires.append(line) + return _install_requires, _dependency_links + + +install_requires, dependency_links = parse_requirements() + setup( name="axolotl", @@ -19,12 +31,10 @@ package_dir={"": "src"}, packages=find_packages(), install_requires=install_requires, + dependency_links=dependency_links, extras_require={ "gptq": [ - "alpaca_lora_4bit @ git+https://github.com/winglian/alpaca_lora_4bit.git@setup_pip", - ], - "gptq_triton": [ - "alpaca_lora_4bit[triton] @ git+https://github.com/winglian/alpaca_lora_4bit.git@setup_pip", + "auto-gptq", ], "flash-attn": [ "flash-attn==2.0.8", @@ -32,8 +42,5 @@ "extras": [ "deepspeed", ], - "peft": [ - "peft @ git+https://github.com/huggingface/peft.git", - ], }, ) diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py index 93a23f7738..0fbccd2054 100644 --- a/src/axolotl/utils/config.py +++ b/src/axolotl/utils/config.py @@ -108,9 +108,7 @@ def validate_config(cfg): "To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.", ) if cfg.load_4bit: - raise ValueError( - "cfg.load_4bit parameter has been deprecated and replaced by cfg.gptq" - ) + raise ValueError("cfg.load_4bit parameter has been deprecated") if cfg.adapter == "qlora": if cfg.merge_lora: diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 9f0795af76..9ec51f4f75 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -4,19 +4,19 @@ import logging import math import os -from pathlib import Path from typing import Optional, Tuple # noqa: F401 import bitsandbytes as bnb import torch import transformers from optimum.bettertransformer import BetterTransformer -from peft import PeftConfig +from peft import PeftConfig, prepare_model_for_kbit_training from transformers import ( # noqa: F401 AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, + GPTQConfig, LlamaConfig, PreTrainedModel, PreTrainedTokenizerBase, @@ -155,32 +155,17 @@ def load_model( LOG.info("patching _expand_mask") hijack_expand_mask() - try: - if cfg.gptq: - from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import ( - replace_peft_model_with_int4_lora_model, - ) - - replace_peft_model_with_int4_lora_model() - except Exception as 
err: - LOG.exception(err) - raise err - - if not cfg.gptq and ( - (cfg.adapter == "lora" and load_in_8bit) - or (cfg.adapter == "qlora" and cfg.load_in_4bit) - ): - try: - from peft import prepare_model_for_kbit_training - except ImportError: - # For backward compatibility - from peft import ( - prepare_model_for_int8_training as prepare_model_for_kbit_training, - ) - model_kwargs = {} if cfg.model_revision: model_kwargs["revision"] = cfg.model_revision + if cfg.gptq: + model_config = load_model_config(cfg) + if hasattr(model_config, "quantization_config"): + LOG.warning("model config does not contain quantization_config information") + else: + model_kwargs["quantization_config"] = GPTQConfig( + **model_config.quantization_config + ) if cfg.adapter == "qlora" and cfg.load_in_4bit: model_kwargs["quantization_config"] = BitsAndBytesConfig( load_in_4bit=True, @@ -191,45 +176,7 @@ def load_model( bnb_4bit_quant_type="nf4", ) try: - if cfg.gptq and cfg.is_llama_derived_model: - from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram - from huggingface_hub import snapshot_download - - try: - snapshot_download_kwargs = {} - if cfg.base_model_ignore_patterns: - snapshot_download_kwargs[ - "ignore_patterns" - ] = cfg.base_model_ignore_patterns - cache_model_path = Path( - snapshot_download(base_model, **snapshot_download_kwargs) - ) - files = ( - list(cache_model_path.glob("*.pt")) - + list(cache_model_path.glob("*.safetensors")) - + list(cache_model_path.glob("*.bin")) - ) - if len(files) > 0: - model_path = str(files[0]) - else: - LOG.warning( - "unable to find a cached model file, this will likely fail..." - ) - model_path = str(cache_model_path) - except Exception: # pylint: disable=broad-exception-caught - model_path = cfg.base_model - model, _ = load_llama_model_4bit_low_ram( - base_model_config if base_model_config else base_model, - model_path, - device_map=cfg.device_map, - half=cfg.fp16, - groupsize=cfg.gptq_groupsize if cfg.gptq_groupsize else -1, - is_v1_model=cfg.gptq_model_v1 - if cfg.gptq_model_v1 is not None - else True, - ) - load_in_8bit = False - elif cfg.is_llama_derived_model and not cfg.trust_remote_code: + if cfg.is_llama_derived_model and not cfg.trust_remote_code and not cfg.gptq: from transformers import LlamaForCausalLM config_kwargs = {} @@ -275,15 +222,24 @@ def load_model( # ) # model.train() # sets to train instead of eval mode elif model_type and not cfg.trust_remote_code: - model = getattr(transformers, model_type).from_pretrained( - base_model, - device_map=cfg.device_map, - load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, - load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None, - torch_dtype=cfg.torch_dtype, - trust_remote_code=cfg.trust_remote_code or False, - **model_kwargs, - ) + if cfg.gptq: + model = AutoModelForCausalLM.from_pretrained( + base_model, + device_map=cfg.device_map, + torch_dtype=cfg.torch_dtype, + trust_remote_code=cfg.trust_remote_code or False, + **model_kwargs, + ) + else: + model = getattr(transformers, model_type).from_pretrained( + base_model, + device_map=cfg.device_map, + load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, + load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None, + torch_dtype=cfg.torch_dtype, + trust_remote_code=cfg.trust_remote_code or False, + **model_kwargs, + ) else: config = AutoConfig.from_pretrained( base_model, @@ -359,11 +315,12 @@ def load_model( module.to(torch.float32) needs_fa2_dtype = cfg.adapter or cfg.fsdp - if not cfg.gptq and ( - (cfg.adapter == "lora" and 
load_in_8bit) - or (cfg.adapter == "qlora" and cfg.load_in_4bit) + if (cfg.adapter == "lora" and load_in_8bit) or ( + cfg.adapter == "qlora" and cfg.load_in_4bit ): LOG.info("converting PEFT model w/ prepare_model_for_kbit_training") + if cfg.gradient_checkpointing: + model.gradient_checkpointing_enable() model = prepare_model_for_kbit_training( model, use_gradient_checkpointing=cfg.gradient_checkpointing ) @@ -385,22 +342,10 @@ def load_model( if cfg.ddp and not load_in_8bit: model.to(f"cuda:{cfg.local_rank}") - if cfg.gptq: - # Scales to half - LOG.info("Fitting 4bit scales and zeros to half") - for _, module in model.named_modules(): - if "Autograd4bitQuantLinear" in str(type(module)) or "Linear4bitLt" in str( - type(module) - ): - if hasattr(module, "is_v1_model") and module.is_v1_model: - module.zeros = module.zeros.half() - module.scales = module.scales.half() - module.bias = module.bias.half() - if ( torch.cuda.device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) > 1 - and (cfg.gptq or cfg.load_in_4bit) + and (cfg.load_in_4bit) ): # llama is PROBABLY model parallelizable, but the default isn't that it is # so let's only set it for the 4bit, see diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index f91f4e318e..c3d6b85cb1 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -514,23 +514,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ training_arguments_kwargs["seed"] = cfg.seed if cfg.gradient_checkpointing: - if cfg.gptq: - from alpaca_lora_4bit.gradient_checkpointing import ( - apply_gradient_checkpointing, - ) - - gradient_checkpointing_ratio = ( - cfg.gradient_checkpointing_ratio - if cfg.gradient_checkpointing_ratio - else 1.0 - ) - apply_gradient_checkpointing( - model, checkpoint_ratio=gradient_checkpointing_ratio - ) - else: - training_arguments_kwargs[ - "gradient_checkpointing" - ] = cfg.gradient_checkpointing + training_arguments_kwargs["gradient_checkpointing"] = cfg.gradient_checkpointing if cfg.fsdp: training_arguments_kwargs["fsdp"] = cfg.fsdp if cfg.fsdp_config: From a546ca2813548d7930b0718c0b4b5b33cf875bc6 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 5 Sep 2023 16:40:13 -0400 Subject: [PATCH 51/67] misc fixes/improvements (#513) fix per pr feedback --- src/axolotl/train.py | 8 +++++--- src/axolotl/utils/trainer.py | 18 +++++++++++------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/axolotl/train.py b/src/axolotl/train.py index 51ef359037..b1be8f8a33 100644 --- a/src/axolotl/train.py +++ b/src/axolotl/train.py @@ -88,6 +88,11 @@ def train( if peft_config: LOG.info(f"Pre-saving adapter config to {cfg.output_dir}") peft_config.save_pretrained(cfg.output_dir) + # additionally presave the tokenizer and model configs + if not Path(cfg.output_dir).is_dir(): + os.makedirs(cfg.output_dir, exist_ok=True) + tokenizer.save_pretrained(str(Path(cfg.output_dir))) + model.config.save_pretrained(str(Path(cfg.output_dir))) # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model if cfg.local_rank == 0: @@ -106,9 +111,6 @@ def terminate_handler(_, __, model): if cfg.group_by_length: LOG.info("hang tight... 
sorting dataset for group_by_length") - if not Path(cfg.output_dir).is_dir(): - os.makedirs(cfg.output_dir, exist_ok=True) - tokenizer.save_pretrained(cfg.output_dir) if cfg.flash_optimum: with torch.backends.cuda.sdp_kernel( enable_flash=True, enable_math=True, enable_mem_efficient=True diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index c3d6b85cb1..3bc283d75b 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -33,6 +33,7 @@ ) from axolotl.utils.collators import DataCollatorForSeq2Seq from axolotl.utils.dataloader import MultipackDistributedDataloader +from axolotl.utils.distributed import is_main_process, zero_first from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup LOG = logging.getLogger("axolotl") @@ -375,14 +376,17 @@ def disable_datasets_caching(): def process_datasets_for_packing(cfg, train_dataset, eval_dataset): drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len) - train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count()) - if eval_dataset: - eval_dataset = eval_dataset.filter(drop_long, num_proc=os.cpu_count()) - - if cfg.sample_packing: - train_dataset = train_dataset.map(add_position_ids, num_proc=os.cpu_count()) + with zero_first(is_main_process()): + train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count()) if eval_dataset: - eval_dataset = eval_dataset.map(add_position_ids, num_proc=os.cpu_count()) + eval_dataset = eval_dataset.filter(drop_long, num_proc=os.cpu_count()) + + if cfg.sample_packing: + train_dataset = train_dataset.map(add_position_ids, num_proc=os.cpu_count()) + if eval_dataset: + eval_dataset = eval_dataset.map( + add_position_ids, num_proc=os.cpu_count() + ) return train_dataset, eval_dataset From 245c5c41e2992c6d49976fa91e521330d3f42afa Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 6 Sep 2023 08:37:51 -0400 Subject: [PATCH 52/67] log rank too (#527) --- src/axolotl/logging_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/axolotl/logging_config.py b/src/axolotl/logging_config.py index bec20cbad2..8f473aa240 100644 --- a/src/axolotl/logging_config.py +++ b/src/axolotl/logging_config.py @@ -23,6 +23,7 @@ class ColorfulFormatter(Formatter): } def format(self, record): + record.rank = int(os.getenv("LOCAL_RANK", "0")) log_message = super().format(record) return self.COLORS.get(record.levelname, "") + log_message + Fore.RESET @@ -35,7 +36,7 @@ def format(self, record): }, "colorful": { "()": ColorfulFormatter, - "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s", + "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] [RANK:%(rank)d] %(message)s", }, }, "filters": {}, From 343714972bdb7ffacf5ddfc84f50918766dacb3a Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 6 Sep 2023 17:00:21 -0400 Subject: [PATCH 53/67] recommend padding when using sample packing (#531) --- examples/code-llama/13b/lora.yml | 1 + examples/code-llama/13b/qlora.yml | 1 + examples/code-llama/34b/lora.yml | 1 + examples/code-llama/34b/qlora.yml | 1 + examples/code-llama/7b/lora.yml | 1 + examples/code-llama/7b/qlora.yml | 1 + examples/llama-2/lora.yml | 1 + examples/llama-2/qlora.yml | 1 + examples/llama-2/relora.yml | 1 + src/axolotl/utils/config.py | 5 +++++ tests/test_validation.py | 14 ++++++++++++++ 11 files changed, 28 insertions(+) diff --git a/examples/code-llama/13b/lora.yml b/examples/code-llama/13b/lora.yml index 
637c051436..e4384a893d 100644 --- a/examples/code-llama/13b/lora.yml +++ b/examples/code-llama/13b/lora.yml @@ -17,6 +17,7 @@ output_dir: ./lora-out sequence_len: 100000 sample_packing: true +pad_to_sequence_len: true adapter: lora lora_model_dir: diff --git a/examples/code-llama/13b/qlora.yml b/examples/code-llama/13b/qlora.yml index ae78f5bf2c..8e482a22e2 100644 --- a/examples/code-llama/13b/qlora.yml +++ b/examples/code-llama/13b/qlora.yml @@ -20,6 +20,7 @@ lora_model_dir: sequence_len: 100000 sample_packing: true +pad_to_sequence_len: true lora_r: 32 lora_alpha: 16 diff --git a/examples/code-llama/34b/lora.yml b/examples/code-llama/34b/lora.yml index 9c4cfee10f..8a5c457f69 100644 --- a/examples/code-llama/34b/lora.yml +++ b/examples/code-llama/34b/lora.yml @@ -17,6 +17,7 @@ output_dir: ./lora-out sequence_len: 100000 sample_packing: true +pad_to_sequence_len: true adapter: lora lora_model_dir: diff --git a/examples/code-llama/34b/qlora.yml b/examples/code-llama/34b/qlora.yml index 9f5ce50f96..b0d91fae92 100644 --- a/examples/code-llama/34b/qlora.yml +++ b/examples/code-llama/34b/qlora.yml @@ -20,6 +20,7 @@ lora_model_dir: sequence_len: 100000 sample_packing: true +pad_to_sequence_len: true lora_r: 32 lora_alpha: 16 diff --git a/examples/code-llama/7b/lora.yml b/examples/code-llama/7b/lora.yml index dfa3f2f7a3..1e09555f75 100644 --- a/examples/code-llama/7b/lora.yml +++ b/examples/code-llama/7b/lora.yml @@ -17,6 +17,7 @@ output_dir: ./lora-out sequence_len: 100000 sample_packing: true +pad_to_sequence_len: true adapter: lora lora_model_dir: diff --git a/examples/code-llama/7b/qlora.yml b/examples/code-llama/7b/qlora.yml index 704f058c31..fc9a5eb533 100644 --- a/examples/code-llama/7b/qlora.yml +++ b/examples/code-llama/7b/qlora.yml @@ -20,6 +20,7 @@ lora_model_dir: sequence_len: 100000 sample_packing: true +pad_to_sequence_len: true lora_r: 32 lora_alpha: 16 diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml index 2a0af130be..a54799b408 100644 --- a/examples/llama-2/lora.yml +++ b/examples/llama-2/lora.yml @@ -17,6 +17,7 @@ output_dir: ./lora-out sequence_len: 4096 sample_packing: true +pad_to_sequence_len: true adapter: lora lora_model_dir: diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml index 3ad2a7e4fd..dd029859ed 100644 --- a/examples/llama-2/qlora.yml +++ b/examples/llama-2/qlora.yml @@ -20,6 +20,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true +pad_to_sequence_len: true lora_r: 32 lora_alpha: 16 diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml index 66515dabc2..b59a7da04c 100644 --- a/examples/llama-2/relora.yml +++ b/examples/llama-2/relora.yml @@ -20,6 +20,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true +pad_to_sequence_len: true lora_r: 8 lora_alpha: 16 diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py index 0fbccd2054..7fc6e1232a 100644 --- a/src/axolotl/utils/config.py +++ b/src/axolotl/utils/config.py @@ -97,6 +97,11 @@ def validate_config(cfg): ) ) + if cfg.sample_packing and not cfg.pad_to_sequence_len: + LOG.warning( + "`pad_to_sequence_len: true` is recommended when using sample_packing" + ) + if cfg.gradient_accumulation_steps and cfg.batch_size: raise ValueError( "please set only one of gradient_accumulation_steps or batch_size" diff --git a/tests/test_validation.py b/tests/test_validation.py index 48b122f9a6..f250e5cb47 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -328,6 +328,20 @@ def test_packing(self): for record in 
self._caplog.records ) + cfg = DictDefault( + { + "sample_packing": True, + "pad_to_sequence_len": None, + } + ) + with self._caplog.at_level(logging.WARNING): + validate_config(cfg) + assert any( + "`pad_to_sequence_len: true` is recommended when using sample_packing" + in record.message + for record in self._caplog.records + ) + cfg = DictDefault( { "max_packed_sequence_len": 2048, From e30f1e3cf7bfa8d5e7bf50a305e0f5c67fbf7b4c Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 8 Sep 2023 11:57:02 -0400 Subject: [PATCH 54/67] Early stopping metric (#537) * set early stopping metric to check * tweak how load_best_model_at_end gets set for early stopping * add validation for earl;y stopping patience * remove negation * save results to metrics in callback * move early stopping callback after the benchmark evals * broadcast metrics so early stopping works --- src/axolotl/utils/callbacks.py | 6 ++++++ src/axolotl/utils/config.py | 9 +++++++++ src/axolotl/utils/distributed.py | 28 ++++++++++++++++++++++++++++ src/axolotl/utils/trainer.py | 21 ++++++++++++--------- 4 files changed, 55 insertions(+), 9 deletions(-) diff --git a/src/axolotl/utils/callbacks.py b/src/axolotl/utils/callbacks.py index 8fc5a918b3..3f776537a5 100644 --- a/src/axolotl/utils/callbacks.py +++ b/src/axolotl/utils/callbacks.py @@ -25,6 +25,7 @@ from axolotl.utils.bench import log_gpu_memory_usage from axolotl.utils.distributed import ( barrier, + broadcast_dict, gather_scalar_from_all_ranks, get_world_size, is_distributed, @@ -271,6 +272,7 @@ def on_evaluate( lambda: len(data_loader), get_world_size() ) + results = {} if is_distributed() and not is_main_process(): dist.gather_object(local_bench_names, dst=0) else: @@ -316,4 +318,8 @@ def on_evaluate( )["accuracy"] trainer.log(results) + results = broadcast_dict(results) + for key, val in results.items(): + metrics[key] = val + return BenchEvalCallback diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py index 7fc6e1232a..6de807eab9 100644 --- a/src/axolotl/utils/config.py +++ b/src/axolotl/utils/config.py @@ -220,6 +220,15 @@ def validate_config(cfg): "sample_packing not compatible with xformers_attention. Use flash_attention" ) + if cfg.early_stopping_patience: + if not cfg.save_steps or not cfg.eval_steps: + raise ValueError( + "`early_stopping_patience` requires save_steps and eval_steps to be set. eval_steps should evenly divide save_steps." + ) + if cfg.save_steps % cfg.eval_steps != 0: + raise ValueError( + "`early_stopping_patience` requires that eval_steps should evenly divide save_steps." 
+ ) # TODO # MPT 7b # https://github.com/facebookresearch/bitsandbytes/issues/25 diff --git a/src/axolotl/utils/distributed.py b/src/axolotl/utils/distributed.py index 5e527f3b95..d48659db1e 100644 --- a/src/axolotl/utils/distributed.py +++ b/src/axolotl/utils/distributed.py @@ -2,6 +2,7 @@ utility helpers for distributed checks """ import os +import pickle # nosec from contextlib import contextmanager import torch @@ -93,3 +94,30 @@ def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-n gathered_values.append(float(tensor.item())) return gathered_values return None + + +def broadcast_dict(vals: dict): + if not is_distributed(): + return vals + + if is_main_process(): + data_byte = pickle.dumps(vals) + data_tensor = torch.ByteTensor(list(data_byte)).to("cuda") + data_size = torch.IntTensor([len(data_byte)]).to("cuda") + else: + data_tensor = torch.empty([1024], dtype=torch.uint8, device="cuda") + data_size = torch.IntTensor([0]).to("cuda") + + dist.broadcast(data_size, 0) + if not is_main_process(): + # resize + data_tensor = data_tensor.new_empty([data_size.item()]) + + dist.broadcast(data_tensor, 0) + + if not is_main_process(): + data_list = data_tensor.cpu().tolist() + data_byte = bytes(data_list[: data_size.item()]) + vals = pickle.loads(data_byte) # nosec + + return vals diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 3bc283d75b..ece1bd9b69 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -576,6 +576,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ training_arguments_kwargs["do_bench_eval"] = cfg.do_bench_eval if cfg.bench_dataset: training_arguments_kwargs["bench_dataset"] = cfg.bench_dataset + if cfg.metric_for_best_model: + training_arguments_kwargs["metric_for_best_model"] = cfg.metric_for_best_model + if cfg.greater_is_better: + training_arguments_kwargs["greater_is_better"] = cfg.greater_is_better # DDP Config if cfg.ddp_timeout: @@ -601,11 +605,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ output_dir=cfg.output_dir, save_total_limit=cfg.save_total_limit if cfg.save_total_limit else 4, load_best_model_at_end=( - cfg.load_best_model_at_end is not False + (cfg.load_best_model_at_end is not False or cfg.early_stopping_patience) and cfg.val_set_size > 0 and cfg.save_steps and cfg.save_steps % cfg.eval_steps == 0 - and cfg.load_in_8bit is not True ) or False, ddp_find_unused_parameters=False if cfg.ddp else None, @@ -637,13 +640,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ if cfg.relora_steps: callbacks.append(ReLoRACallback(cfg)) - # TODO on_save callback to sync checkpoints to GCP/AWS in background - if cfg.early_stopping_patience: - early_stop_cb = EarlyStoppingCallback( - cfg.early_stopping_patience, - ) - callbacks.append(early_stop_cb) - if cfg.local_rank == 0 and cfg.adapter in [ "lora", "qlora", @@ -710,4 +706,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ if cfg.do_bench_eval: trainer.add_callback(bench_eval_callback_factory(trainer, tokenizer)) + # TODO on_save callback to sync checkpoints to GCP/AWS in background + if cfg.early_stopping_patience: + early_stop_cb = EarlyStoppingCallback( + cfg.early_stopping_patience, + ) + trainer.add_callback(early_stop_cb) + return trainer From 5e2d8a42d9e733a3eb55b90bd1cba45e77f789df Mon Sep 17 00:00:00 2001 From: The Objective Dad <63609026+theobjectivedad@users.noreply.github.com> Date: Fri, 
8 Sep 2023 10:57:47 -0500 Subject: [PATCH 55/67] Adding NCCL Timeout Guide (#536) * fixes NCCL_P2P_LEVEL=NVL #429 * adding more insights into verious values of NCCL_P2P_LEVEL --- README.md | 4 ++++ docs/nccl.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 docs/nccl.md diff --git a/README.md b/README.md index 30f7b4844f..90dd96e8b6 100644 --- a/README.md +++ b/README.md @@ -752,6 +752,10 @@ Try to turn off xformers. It's safe to ignore it. +> NCCL Timeouts during training + +See the [NCCL](docs/nccl.md) guide. + ## Need help? 🙋♂️ Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you diff --git a/docs/nccl.md b/docs/nccl.md new file mode 100644 index 0000000000..4a7ff5d5d6 --- /dev/null +++ b/docs/nccl.md @@ -0,0 +1,46 @@ +# NCCL + +NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several [environment variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html). A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort: + +```text +Watchdog caught collective operation timeout: WorkNCCL(SeqNum=42, OpType=ALLGATHER, Timeout(ms)=1800000) ran for 1806948 milliseconds before timing out. +``` + +Often, this timeout will happen after 30 minutes (the default setting) and is accompanied by below-average power consumption with near 100% GPU utilization before the error is raised. Nvidia recommends [disabling PCI access control services (ACS)](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#pci-access-control-services-acs) as a possible solution if this is available to you. + +Forcing cross-GPU communication via [NVLink](https://en.wikipedia.org/wiki/NVLink) may help without increasing timeouts. To verify that your configuration is leveraging NVLink run the following command: + +```shell +nvidia-smi nvlink --status +``` + +To force NCCL to use NVLink, simply set this in the environment: + +```shell +export NCCL_P2P_LEVEL=NVL +``` + +If NVLink is not available in your environment there are other options for ``NCCL_P2P_LEVEL`` in the table below: + +| NCCL_P2P_LEVEL | Description | +| -------------- | ----------- | +| PIX | P2P data transfers through no more than a single PCIe bridge. Faster data transfer rates vs to paths involving multiple bridges, but slower compared to direct GPU-to-GPU communication. | +| PXB | P2P data transfers through multiple PCIe bridges but not going through the PCIe Host Bridge; this path involves a complex routing process, potentially incurring a moderate level of latency. 
| +| PHB | P2P data transfers occur over the PCIe and through a PCIe Host Bridge, typically involving the CPU, which can facilitate direct memory access but might introduce additional latency compared to more direct paths (ex PIX, NVL) | + +To validate that acceptable data transfer speeds exist for your training job, running [NCCL Tests](https://github.com/NVIDIA/nccl-tests/blob/master/README.md) can help pinpoint bottlenecks, for example: + +```shell +./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3 +``` + +It can be useful when debugging NCCL communication timeouts to activate additional logging in both PyTorch and NCCL: + +```shell +export NCCL_DEBUG=INFO +export NCCL_DEBUG_SUBSYS=ALL +export TORCH_DISTRIBUTED_DEBUG=INFO +export TORCHELASTIC_ERROR_FILE=/PATH/TO/torcherror.log +``` + +Finally, if you believe your training job needs more time you can increase the timeout past 30 minutes by setting the ``ddp_timeout`` value in the Axolotl configuration. See [PyTorch init_process_group](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for documentation on this value. From 34c0a86a1152430f4189278a5f8f1da375f5f36b Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 8 Sep 2023 11:58:54 -0400 Subject: [PATCH 56/67] update readme to point to direct link to runpod template, cleanup install instrucitons (#532) * update readme to point to direct link to runpod template, cleanup install instrucitons * default install flash-attn and auto-gptq now too * update readme w flash-attn extra * fix version in setup --- .github/workflows/tests.yml | 4 ++-- README.md | 20 ++++---------------- docker/Dockerfile | 4 ++-- requirements.txt | 2 +- setup.py | 9 ++------- 5 files changed, 11 insertions(+), 28 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d5184def60..9ff08db074 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -24,8 +24,8 @@ jobs: - name: Install dependencies run: | - pip install -e . - pip install -r requirements-tests.txt + pip3 install -e . + pip3 install -r requirements-tests.txt - name: Run tests run: | diff --git a/README.md b/README.md index 90dd96e8b6..775592efe6 100644 --- a/README.md +++ b/README.md @@ -90,8 +90,7 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \ ```bash docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.10-cu118-2.0.1 ``` - - `winglian/axolotl-runpod:main-py3.10-cu118-2.0.1`: for runpod - - `winglian/axolotl-runpod:main-py3.9-cu118-2.0.1-gptq`: for gptq + - `winglian/axolotl-runpod:main-latest`: for runpod or use this [direct link](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz) Or run on the current files for development: @@ -104,19 +103,9 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \ 2. Install pytorch stable https://pytorch.org/get-started/locally/ - 3. Install python dependencies with ONE of the following: - - Recommended, supports QLoRA, NO gptq/int4 support + 3. Install axolotl along with python dependencies ```bash - pip3 install -e . - pip3 install -U git+https://github.com/huggingface/peft.git - ``` - - gptq/int4 support, NO QLoRA - ```bash - pip3 install -e .[gptq] - ``` - - same as above but not recommended - ```bash - pip3 install -e .[gptq_triton] + pip3 install -e .[flash-attn] ``` - LambdaLabs @@ -151,10 +140,9 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \ git clone https://github.com/OpenAccess-AI-Collective/axolotl cd axolotl - pip3 install -e . 
# change depend on needs + pip3 install -e . pip3 install protobuf==3.20.3 pip3 install -U --ignore-installed requests Pillow psutil scipy - pip3 install git+https://github.com/huggingface/peft.git # not for gptq ``` 5. Set path diff --git a/docker/Dockerfile b/docker/Dockerfile index 683ca75ffd..8608e2348b 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -15,9 +15,9 @@ RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git # If AXOLOTL_EXTRAS is set, append it in brackets RUN cd axolotl && \ if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ - pip install -e .[flash-attn,gptq,$AXOLOTL_EXTRAS]; \ + pip install -e .[flash-attn,$AXOLOTL_EXTRAS]; \ else \ - pip install -e .[flash-attn,gptq]; \ + pip install -e .[flash-attn]; \ fi # fix so that git fetch/pull from remote works diff --git a/requirements.txt b/requirements.txt index 1c8e97dffc..6d33f5728d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ evaluate fire PyYAML>=6.0 datasets -flash-attn>=2.0.8 +flash-attn>=2.2.1 sentencepiece wandb einops diff --git a/setup.py b/setup.py index 973d656cd1..a53603f8bf 100644 --- a/setup.py +++ b/setup.py @@ -7,9 +7,7 @@ def parse_requirements(): _install_requires = [] _dependency_links = [] with open("./requirements.txt", encoding="utf-8") as requirements_file: - lines = [ - r.strip() for r in requirements_file.readlines() if "auto-gptq" not in r - ] + lines = [r.strip() for r in requirements_file.readlines()] for line in lines: if line.startswith("--extra-index-url"): # Handle custom index URLs @@ -33,11 +31,8 @@ def parse_requirements(): install_requires=install_requires, dependency_links=dependency_links, extras_require={ - "gptq": [ - "auto-gptq", - ], "flash-attn": [ - "flash-attn==2.0.8", + "flash-attn>=2.2.1", ], "extras": [ "deepspeed", From 78ee2cdab28f0d346b7039308f35bf9cc512bc94 Mon Sep 17 00:00:00 2001 From: SlapDrone <32279503+SlapDrone@users.noreply.github.com> Date: Fri, 8 Sep 2023 21:59:49 +0200 Subject: [PATCH 57/67] add git environment variables to compose: avoid checkout failure error 128 on build (#534) --- docker-compose.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docker-compose.yaml b/docker-compose.yaml index d40422f94f..a16be726cf 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -9,6 +9,11 @@ services: - ~/.cache/huggingface/:/root/.cache/huggingface/ # set environment variables environment: + # Set environment variables + - GIT_AUTHOR_NAME=${GIT_AUTHOR_NAME} + - GIT_AUTHOR_EMAIL=${GIT_AUTHOR_EMAIL} + - GIT_COMMITTER_NAME=${GIT_COMMITTER_NAME} + - GIT_COMMITTER_EMAIL=${GIT_COMMITTER_EMAIL} - WANDB_API_KEY=${WANDB_API_KEY} deploy: resources: From 0b4cf5bc8c24fe7084473a1f7cdd2cf1231e7168 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 8 Sep 2023 16:01:05 -0400 Subject: [PATCH 58/67] workaround for md5 variations (#533) * workaround for md5 variations * refactor the prepared hash too --- src/axolotl/utils/data.py | 28 +++++++++-------- tests/test_data.py | 64 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 13 deletions(-) create mode 100644 tests/test_data.py diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 20d0fcfb88..f322b800b5 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -2,7 +2,6 @@ import functools import hashlib import logging -from hashlib import md5 from pathlib import Path from typing import Tuple, Union @@ -52,6 +51,13 @@ DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared" +def md5(to_hash: str, encoding: str = 
"utf-8") -> str: + try: + return hashlib.md5(to_hash.encode(encoding), usedforsecurity=False).hexdigest() + except TypeError: + return hashlib.md5(to_hash.encode(encoding)).hexdigest() # nosec + + def prepare_dataset(cfg, tokenizer): if not cfg.pretraining_dataset: with zero_first(is_main_process()): @@ -88,7 +94,7 @@ def load_tokenized_prepared_datasets( ) -> DatasetDict: tokenizer_name = tokenizer.__class__.__name__ ds_hash = str( - md5( # nosec + md5( ( str(cfg.sequence_len) + "@" @@ -97,8 +103,8 @@ def load_tokenized_prepared_datasets( ) + "|" + tokenizer_name - ).encode("utf-8") - ).hexdigest() + ) + ) ) prepared_ds_path = ( Path(cfg.dataset_prepared_path) / ds_hash @@ -374,7 +380,7 @@ def load_prepare_datasets( # see if we can go ahead and load the stacked dataset seed = f"@{str(cfg.seed)}" if cfg.seed else "" ds_hash = str( - md5( # nosec + md5( ( str(cfg.sequence_len) + "@" @@ -385,8 +391,8 @@ def load_prepare_datasets( ) + "|" + tokenizer_name - ).encode("utf-8") - ).hexdigest() + ) + ) ) prepared_ds_path = ( Path(cfg.dataset_prepared_path) / ds_hash @@ -500,12 +506,8 @@ def load_prepare_datasets( + "|" + str(cfg.seed or 42) ) - train_fingerprint = hashlib.md5( - to_hash_train.encode(), usedforsecurity=False - ).hexdigest() - test_fingerprint = hashlib.md5( - to_hash_test.encode(), usedforsecurity=False - ).hexdigest() + train_fingerprint = md5(to_hash_train) + test_fingerprint = md5(to_hash_test) with zero_first(is_main_process()): dataset = dataset.train_test_split( diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000000..9d7f5a0412 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,64 @@ +""" +test module for the axolotl.utis.data module +""" +import unittest + +from transformers import LlamaTokenizer + +from axolotl.utils.data import encode_pretraining, md5 + + +class TestEncodePretraining(unittest.TestCase): + """ + test class for encode pretraining and md5 helper + """ + + def setUp(self): + self.tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b") + self.tokenizer.add_special_tokens( + { + "eos_token": "", + "bos_token": "", + "unk_token": "", + "pad_token": "", + } + ) + self.max_tokens = 15 # set a small number for easy inspection + + def test_encode_pretraining(self): + examples = { + "text": [ + "Hello, world!", + "Nice to meet you.", + "lorem ipsum dolor sit amet.", + "Nice to meet you again!.", + "hello, hello", + ] + } + result = encode_pretraining(self.tokenizer, self.max_tokens, examples) + + self.assertEqual(len(result["input_ids"]), 3) + + # Assert the length of input_ids and attention_mask is correct + self.assertEqual(len(result["input_ids"][0]), self.max_tokens) + self.assertEqual(len(result["attention_mask"][0]), self.max_tokens) + + # Assert EOS and PAD tokens are correctly added + # hello world! 
is 4 tokens + self.assertEqual(result["input_ids"][0][0], self.tokenizer.bos_token_id) + self.assertEqual(result["input_ids"][0][5], self.tokenizer.eos_token_id) + self.assertEqual(result["input_ids"][0][6], self.tokenizer.pad_token_id) + # second part, 5 tokens + self.assertEqual(result["input_ids"][0][7], self.tokenizer.bos_token_id) + self.assertEqual(result["input_ids"][0][13], self.tokenizer.eos_token_id) + self.assertEqual(result["input_ids"][0][14], self.tokenizer.pad_token_id) + + def test_md5(self): + self.assertEqual(md5("hello world"), "5eb63bbbe01eeed093cb22bb8f5acdc3") + self.assertEqual( + md5("hello world", "utf-8"), "5eb63bbbe01eeed093cb22bb8f5acdc3" + ) + + +if __name__ == "__main__": + unittest.main() From c1921c9acb66c2a8b6542584f62bb02bc543acbf Mon Sep 17 00:00:00 2001 From: dongxiaolong <774848421@qq.com> Date: Sat, 9 Sep 2023 04:07:11 +0800 Subject: [PATCH 59/67] Update requirements.txt (#543) fix fsdp --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6d33f5728d..1e95b716ec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ packaging peft @ git+https://github.com/huggingface/peft.git transformers @ git+https://github.com/huggingface/transformers.git bitsandbytes>=0.41.1 -accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b +accelerate @ git+https://github.com/huggingface/accelerate addict evaluate fire From a94f9cb99e5927cd78830bfab1ebacc2b12fca95 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 10 Sep 2023 12:40:52 -0400 Subject: [PATCH 60/67] fix for quant config from model (#540) --- src/axolotl/utils/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 9ec51f4f75..2000b1aee8 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -160,7 +160,7 @@ def load_model( model_kwargs["revision"] = cfg.model_revision if cfg.gptq: model_config = load_model_config(cfg) - if hasattr(model_config, "quantization_config"): + if not hasattr(model_config, "quantization_config"): LOG.warning("model config does not contain quantization_config information") else: model_kwargs["quantization_config"] = GPTQConfig( From b56503d42302903eb0c1a947bf3204983b4c2256 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 11 Sep 2023 09:44:47 -0400 Subject: [PATCH 61/67] publish to pypi workflow on tagged release (#549) --- .github/workflows/pypi.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/pypi.yml diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 0000000000..09690fbebf --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,23 @@ +name: publish pypi + +on: + push: + branches: + - "main" + +jobs: + pypi-publish: + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') + name: Upload release to PyPI + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/axolotl-ai + permissions: + id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + with: + verbose: true + steps: + # retrieve your distributions here + - name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 From c5dedb17ad4ca230837eca7c3b2ebf3497e99791 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 11 Sep 2023 10:27:17 -0400 Subject: [PATCH 62/67] remove with section, doesn't seem to 
work (#551) --- .github/workflows/pypi.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 09690fbebf..4f3e9bcc32 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -12,11 +12,9 @@ jobs: runs-on: ubuntu-latest environment: name: pypi - url: https://pypi.org/p/axolotl-ai + url: https://pypi.org/p/axolotl permissions: id-token: write # IMPORTANT: this permission is mandatory for trusted publishing - with: - verbose: true steps: # retrieve your distributions here - name: Publish package distributions to PyPI From 20ed4c1f9e5cfbea1e49fd2dcc1e6048c6174731 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 11 Sep 2023 10:33:42 -0400 Subject: [PATCH 63/67] pypi on tag push (#552) --- .github/workflows/pypi.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 4f3e9bcc32..790b9ce36e 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -2,12 +2,11 @@ name: publish pypi on: push: - branches: - - "main" + tags: + - '*' jobs: pypi-publish: - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') name: Upload release to PyPI runs-on: ubuntu-latest environment: From 6d57f2f0f0b785d8e3a35e928bc8f605548fd8d8 Mon Sep 17 00:00:00 2001 From: The Objective Dad <63609026+theobjectivedad@users.noreply.github.com> Date: Mon, 11 Sep 2023 11:35:45 -0500 Subject: [PATCH 64/67] ergonomic update to optimizer config doc (#548) --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index 775592efe6..ac68003a65 100644 --- a/README.md +++ b/README.md @@ -560,6 +560,30 @@ log_sweep_min_lr: log_sweep_max_lr: # specify optimizer +# Valid values are driven by the Transformers OptimizerNames class, see: +# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134 +# +# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of +# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used +# in the examples/ for your model and fine-tuning use case. 
+# +# Valid values for 'optimizer' include: +# - adamw_hf +# - adamw_torch +# - adamw_torch_fused +# - adamw_torch_xla +# - adamw_apex_fused +# - adafactor +# - adamw_anyprecision +# - sgd +# - adagrad +# - adamw_bnb_8bit +# - lion_8bit +# - lion_32bit +# - paged_adamw_32bit +# - paged_adamw_8bit +# - paged_lion_32bit +# - paged_lion_8bit optimizer: # specify weight decay weight_decay: From bcbc9597e95c6a6787def6f14dc9f84d0da719b4 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 11 Sep 2023 13:25:41 -0400 Subject: [PATCH 65/67] replace tags, build dist for pypi publish (#553) * replace tags, build dist for pypi publish * missing trailing comma --- .github/workflows/pypi.yml | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 790b9ce36e..f6d62e1ff7 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -15,6 +15,31 @@ jobs: permissions: id-token: write # IMPORTANT: this permission is mandatory for trusted publishing steps: - # retrieve your distributions here + - name: Check out repository code + uses: actions/checkout@v3 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install dependencies + run: | + pip3 install wheel + pip3 install -e . + pip3 install -r requirements-tests.txt + + - name: Extract tag name + id: tag + run: echo ::set-output name=TAG_NAME::$(echo $GITHUB_REF | cut -d / -f 3) + + - name: Update version in setup.py + run: >- + sed -E 's/version=\"([\d\.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py + + - name: Build a binary wheel + run: >- + python setup.py sdist bdist_wheel + - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@release/v1 From 6c5fbe6223b32665da43591fda28e7e1ee3d0736 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 11 Sep 2023 13:34:29 -0400 Subject: [PATCH 66/67] add long_description for pypi push (#555) --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a53603f8bf..7488f12ae3 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,8 @@ def parse_requirements(): setup( name="axolotl", version="0.1", - description="You know you're going to axolotl questions", + description="LLM Trainer", + long_description="Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.", package_dir={"": "src"}, packages=find_packages(), install_requires=install_requires, From 772cd870d462e2507d8c309702d038364c3ee08d Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 11 Sep 2023 13:44:19 -0400 Subject: [PATCH 67/67] fix the sed command to replace the version w the tag --- .github/workflows/pypi.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index f6d62e1ff7..27b1cb8d69 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -35,7 +35,7 @@ jobs: - name: Update version in setup.py run: >- - sed -E 's/version=\"([\d\.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py + sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py - name: Build a binary wheel run: >- diff --git a/setup.py b/setup.py index 7488f12ae3..fca5088da1 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ def parse_requirements(): setup( name="axolotl", - version="0.1", + version="0.3.0", 
description="LLM Trainer", long_description="Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.", package_dir={"": "src"},