From 2a1589f6f6fbe74a58f867f48ab9cb0c35cf9df9 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 21 Mar 2024 11:56:13 -0400 Subject: [PATCH] strip out hacky qlora-fsdp workarounds now that qlora-fsdp fixes are upstreamed (#1428) --- examples/llama-2/qlora-fsdp.yml | 8 +- requirements.txt | 7 +- src/axolotl/core/policies/__init__.py | 0 src/axolotl/core/policies/auto_wrap.py | 55 ------- src/axolotl/core/trainer_builder.py | 52 +----- src/axolotl/utils/models.py | 220 +------------------------ src/axolotl/utils/trainer.py | 4 + tests/e2e/test_mixtral.py | 4 +- 8 files changed, 27 insertions(+), 323 deletions(-) delete mode 100644 src/axolotl/core/policies/__init__.py delete mode 100644 src/axolotl/core/policies/auto_wrap.py diff --git a/examples/llama-2/qlora-fsdp.yml b/examples/llama-2/qlora-fsdp.yml index da6c06020f..30916ed45a 100644 --- a/examples/llama-2/qlora-fsdp.yml +++ b/examples/llama-2/qlora-fsdp.yml @@ -36,7 +36,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 4 num_epochs: 4 -optimizer: paged_adamw_8bit +optimizer: adamw_torch lr_scheduler: cosine learning_rate: 0.00001 @@ -66,5 +66,11 @@ weight_decay: 0.0 fsdp: - full_shard fsdp_config: + fsdp_limit_all_gathers: true + fsdp_sync_module_states: true + fsdp_offload_params: true + fsdp_use_orig_params: false + fsdp_cpu_ram_efficient_loading: true fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer + fsdp_state_dict_type: SHARDED_STATE_DICT special_tokens: diff --git a/requirements.txt b/requirements.txt index aaa27c547b..75ce7a0d8a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ packaging==23.2 peft==0.9.0 -transformers @ git+https://github.com/huggingface/transformers.git@f6261d7d81edd036fc53bfede65fe91f01a661aa +transformers @ git+https://github.com/huggingface/transformers.git@73a73b415e36f41481369f6129cb4b62bb127a78 tokenizers==0.15.0 -bitsandbytes>=0.43.0 -accelerate==0.26.1 +bitsandbytes==0.43.0 +accelerate==0.28.0 deepspeed==0.13.1 pydantic==2.6.3 addict @@ -40,4 +40,3 @@ gcsfs # adlfs trl @ git+https://github.com/huggingface/trl.git@304e208f778a5442c30cdda500348226cdc97d90 -fastcore>=1.5.29 diff --git a/src/axolotl/core/policies/__init__.py b/src/axolotl/core/policies/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/axolotl/core/policies/auto_wrap.py b/src/axolotl/core/policies/auto_wrap.py deleted file mode 100644 index d42b62ee08..0000000000 --- a/src/axolotl/core/policies/auto_wrap.py +++ /dev/null @@ -1,55 +0,0 @@ -"""module for building the auto wrap policy for FSDP""" -import functools - -from peft import PrefixEncoder, PromptEmbedding, PromptEncoder -from torch.distributed.fsdp.wrap import ( - _or_policy, - lambda_auto_wrap_policy, - transformer_auto_wrap_policy, -) -from transformers.models.llama.modeling_llama import LlamaDecoderLayer -from transformers.models.mistral.modeling_mistral import MistralDecoderLayer -from transformers.models.mixtral.modeling_mixtral import MixtralDecoderLayer - -SUPPORTED_AUTO_WRAP_MODEL_TYPES = [ - "llama", - "mistral", - "mixtral", -] - - -def get_wrapping_policy_factory(model_type): - if model_type == "llama": - layer_to_wrap = LlamaDecoderLayer - elif model_type == "mistral": - layer_to_wrap = MistralDecoderLayer - elif model_type == "mixtral": - layer_to_wrap = MixtralDecoderLayer - - def get_wrapping_policy(): - """This checks for lora layers (has weight and requires_grad)""" - - def lambda_policy_fn(module): - return ( - 
len(list(module.named_children())) == 0 - and getattr(module, "weight", None) is not None - and module.weight.requires_grad - ) - - lambda_policy = functools.partial( - lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn - ) - transformer_layer_name = layer_to_wrap - transformer_wrap_policy = functools.partial( - transformer_auto_wrap_policy, - transformer_layer_cls=( - PrefixEncoder, - PromptEncoder, - PromptEmbedding, - transformer_layer_name, - ), - ) - policies = [lambda_policy, transformer_wrap_policy] - return functools.partial(_or_policy, policies=policies) - - return get_wrapping_policy diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py index 53f6cca903..c2d622ceec 100644 --- a/src/axolotl/core/trainer_builder.py +++ b/src/axolotl/core/trainer_builder.py @@ -8,7 +8,6 @@ import importlib.util import logging import math -import os import sys from abc import abstractmethod from collections import defaultdict @@ -19,10 +18,7 @@ import torch import transformers -from accelerate import FullyShardedDataParallelPlugin -from accelerate.utils import str_to_bool from datasets import Dataset -from torch.distributed.fsdp import MixedPrecision from torch.optim.lr_scheduler import OneCycleLR from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler from transformers import ( @@ -35,7 +31,6 @@ from transformers.utils import is_sagemaker_mp_enabled from trl import DPOTrainer -from axolotl.core.policies.auto_wrap import get_wrapping_policy_factory from axolotl.loraplus import create_loraplus_optimizer from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler @@ -591,51 +586,14 @@ def push_to_hub(self, *args, **kwargs) -> str: @wraps(Trainer.create_accelerator_and_postprocess) def create_accelerator_and_postprocess(self): - rank = int(os.environ.get("LOCAL_RANK", 0)) res = super().create_accelerator_and_postprocess() - if self.args.qlora is False: - return res - - # the rest of this method override is specific to fsdp + qlora (for now) - sync_module_states = ( - str_to_bool(os.environ.get("FSDP_SYNC_MODULE_STATES", "True")) == 1 - ) - - mp_policy = None - amp = os.environ["ACCELERATE_MIXED_PRECISION"] - if amp == "fp16": - mp_policy = MixedPrecision( - param_dtype=torch.float32, - reduce_dtype=torch.float32, - buffer_dtype=torch.float32, - ) - elif amp == "bf16": - mp_policy = MixedPrecision( - param_dtype=torch.float32, - reduce_dtype=torch.float32, - buffer_dtype=torch.float32, - ) - - # If somehow we figure out how we want to parameterize we want to autocast buffers... 
- # mp_policy = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16, buffer_dtype=torch.float32) - # load_param_skip_names = ['inv_freq'] - if self.is_fsdp_enabled: - wrapping_policy = get_wrapping_policy_factory(self.args.model_type) - fsdp_plugin = FullyShardedDataParallelPlugin( - auto_wrap_policy=wrapping_policy(), - cpu_offload=False, - use_orig_params=False, - limit_all_gathers=True, - param_init_fn=lambda module: module.to_empty( - device=torch.device("cuda"), recurse=False - ) - if (rank != 0 and sync_module_states) - else None, - mixed_precision_policy=mp_policy, - ) - self.accelerator.state.fsdp_plugin = fsdp_plugin + if ( + "limit_all_gathers" in self.args.fsdp_config + and self.args.fsdp_config["limit_all_gathers"] + ): + self.accelerator.state.fsdp_plugin.limit_all_gathers = True return res diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 40090a07c0..41fd471e65 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -5,16 +5,14 @@ import math import os import types -from typing import Any, Dict, List, Optional, Tuple, Type, Union # noqa: F401 +from typing import Any, Dict, Optional, Tuple, Union # noqa: F401 import addict import bitsandbytes as bnb -import safetensors import torch import transformers from accelerate import init_empty_weights -from bitsandbytes.nn import Linear4bit, Params4bit -from fastcore.parallel import parallel +from bitsandbytes.nn import Params4bit from peft import ( LoftQConfig, PeftConfig, @@ -23,7 +21,7 @@ prepare_model_for_kbit_training, ) from peft.tuners.lora import QuantLinear -from torch import Tensor, nn +from torch import nn from transformers import ( # noqa: F401 AddedToken, AutoConfig, @@ -35,9 +33,7 @@ PreTrainedTokenizerBase, ) from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled -from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, hub -from axolotl.core.policies.auto_wrap import SUPPORTED_AUTO_WRAP_MODEL_TYPES from axolotl.models.mamba import fix_mamba_attn_for_loss from axolotl.monkeypatch.multipack import ( SUPPORTED_MULTIPACK_MODEL_TYPES, @@ -272,117 +268,6 @@ def load_tokenizer(cfg): return tokenizer -def replace_linear( - model: nn.Module, - linear_replacement: Type[nn.Module], - quant_config: Union[dict, None] = None, - skip_modules=None, - **kwargs, -): - """ - Replace linear modules with a new Linear module. - Parameters: - model (`torch.nn.Module`): - Input model or `torch.nn.Module` as the function is run recursively. - linear_replacement (`torch.nn.Module`): - The linear module that replaces the old one. Only expects standard arguments. - If other arguments need to be passed, use a lambda. - skip_modules (`List[str]`, *optional*, defaults to `lm_head`): - List of modules names not to convert. Defaults to `lm_head`. 
- """ - if skip_modules is None: - skip_modules = ["lm_head"] - for name, module in model.named_children(): - if len(list(module.children())) > 0: - replace_linear( - module, linear_replacement, quant_config, skip_modules, **kwargs - ) - - if isinstance(module, torch.nn.Linear) and name not in skip_modules: - if issubclass(linear_replacement, Linear4bit): - model._modules[ # pylint: disable=protected-access - name - ] = linear_replacement( - module.in_features, - module.out_features, - module.bias is not None, - **kwargs, - ) - else: - raise ValueError( - f"Unsupported linear replacement: {type(linear_replacement)}" - ) - return model - - -def load_and_quantize( - module: nn.Module, - name: str, - value: Tensor, - device: torch.device = None, - dtype: torch.dtype = None, - skip_names: Optional[List[str]] = None, - is_meta_rank: bool = False, - low_memory: bool = True, - verbose: bool = False, - quant_method: str = "bnb", -): - """ - Loads `value` tensor into submodule of `module`, optionally skipping `skip_names` and converting to `dtype`. - - Quantizes `Params4bit` on `device` then places on "cpu" if low_memory=True or "meta" if is_meta_rank=True. - """ - - if skip_names is None: - skip_names = [] - - def place_on_device(value): - if is_meta_rank: - device = "meta" - elif low_memory: - device = "cpu" - else: - device = "cuda" - return value.to(device=device, dtype=dtype) - - if any(skip_name in name for skip_name in skip_names): - if verbose: - print(f"Skipping {name} because it is in skip_names") - return - - module_key, _, value_key = name.rpartition(".") - try: - submodule = module.get_submodule(module_key) - except AttributeError as exc: - print(f"Module {module_key} not found:\n{exc}") - return - - try: - if quant_method == "bnb": - param = submodule.get_parameter(value_key) - if isinstance(param, Params4bit): - # With `sync_module_states=True`, a meta device Params4bit needs to be the same - # shape as the quantized Params4bit with an initialized quant_state. However, - # FSDP only syncs parameters and buffers, so the quant_state isn't copied. This - # workaround quantizes Params4bit to initialize quant_state on all ranks, then - # replaces Params4bit's data with a meta tensor to free memory on non-rank 0. 
- value = type(param)( - value.to(device=device, dtype=dtype).data, **param.__dict__ - ).cuda(device) - if is_meta_rank: - value = type(param)(value.data.to("meta"), **value.__dict__) - elif low_memory: - value = type(param)(value.data.to("cpu"), **value.__dict__) - else: - value = type(param)(place_on_device(value).data) - - except AttributeError: - # it's a buffer - value = place_on_device(value) - - setattr(submodule, value_key, value) - - def load_model( cfg: DictDefault, tokenizer: PreTrainedTokenizerBase, @@ -568,6 +453,7 @@ def load_model( "bnb_4bit_compute_dtype": cfg.torch_dtype, "bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4", + "bnb_4bit_quant_storage": torch.bfloat16, } if cfg.bnb_config_kwargs: @@ -617,78 +503,10 @@ def load_model( model_kwargs["attn_implementation"] = "eager" model_config._attn_implementation = "eager" # pylint: disable=protected-access - qlora_fsdp = ( - cfg.fsdp - and cfg.adapter == "qlora" - and model_config.model_type in SUPPORTED_AUTO_WRAP_MODEL_TYPES - ) + qlora_fsdp = cfg.fsdp and cfg.adapter == "qlora" try: - if qlora_fsdp: - if cfg.bf16 or cfg.bfloat16: - torch_dtype, compute_dtype = torch.float32, torch.bfloat16 - elif cfg.fp16 or cfg.float16: - torch_dtype, compute_dtype = torch.float32, torch.float16 - else: - torch_dtype, compute_dtype = torch.float32, torch.float16 - - with init_empty_weights(): - LOG.info("Loading model with empty weights.") - model = AutoModelForCausalLM.from_config(model_config) - model.model = replace_linear( - model.model, - Linear4bit, - compute_dtype=compute_dtype, - quant_type="nf4", - quant_storage=torch_dtype, - ) - - model.is_loaded_in_4bit = True - - # Grab the safetensors files that hold the weights - try: - idx = hub.cached_file(base_model, SAFE_WEIGHTS_INDEX_NAME) - files, _ = hub.get_checkpoint_shard_files(base_model, idx) - except OSError: - try: - # This means the model doesn't have a model.safetensors.index.json because it is not sharded - files = [] - files.append(hub.cached_file(base_model, SAFE_WEIGHTS_NAME)) - except OSError as exc: - # This means the model probably doesn't have a safetensors file - raise exc - - # Load in the weights, using our custom load_and_quantize method which quantizes Params4bit on the fly - # and then places each layer on CPU or meta if using low_memory to minimize GPU memory usage - def load_and_quantize_parallel(name_param, model, **kwargs): - name, param = name_param - load_and_quantize(model, name, param, **kwargs) - - param_count = sum((p.numel() for n, p in model.named_parameters())) - for filename in files: - weights = safetensors.torch.load_file(filename) - quant_method = "bnb" - devprops = torch.cuda.get_device_properties(torch.cuda.current_device()) - left = int(os.cpu_count() / torch.cuda.device_count()) - right = int( - 8 * (devprops.total_memory / 1e9 / 40) * (70 / (param_count / 1e9)) - ) - n_workers = min(left, right) - parallel( - load_and_quantize_parallel, - weights.items(), - n_workers=n_workers, - threadpool=True, - model=model, - dtype=torch_dtype, - device=cfg.local_rank, - skip_names=[], - is_meta_rank=(cfg.local_rank != 0), - verbose=False, - quant_method=quant_method, - ) - - elif ( + if ( model_config.model_type == "llama" and not cfg.trust_remote_code and not cfg.gptq @@ -715,32 +533,6 @@ def load_and_quantize_parallel(name_param, model, **kwargs): if cfg.flash_attn_fuse_qkv: LOG.info("patching with fused QKV") replace_llama_qkv_with_fused(model) - # elif model_type == "GPTNeoXForCausalLM" and cfg.flash_attention: - # This is a WIP, still 
an issue with the backward pass - # RuntimeError: grad can be implicitly created only for scalar outputs - # TODO: try config.sequence_parallel = False - # # https://github.com/HazyResearch/flash-attention/blob/40a25c8ee7465cf547b929cfa2937034e37bfce9/tests/models/test_gpt_neox.py#L12 - # # https://github.com/HazyResearch/flash-attention/tree/main/training#model-components - # # add `**kwargs` to https://github.com/HazyResearch/flash-attention/blob/40a25c8ee7465cf547b929cfa2937034e37bfce9/flash_attn/models/gpt.py#L442 - # from flash_attn.utils.pretrained import state_dict_from_pretrained - # from flash_attn.models.gpt import GPTLMHeadModel - # from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox, gpt_neox_config_to_gpt2_config - # from transformers import GPTNeoXConfig - # config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(base_model)) - # config.use_flash_attn = True - # config.fused_bias_fc = True - # config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" - # config.activation_function = "gelu_fast" - # config.fused_dropout_add_ln = True - # # config.residual_in_fp32 = True - # - # model: GPTLMHeadModel = GPTLMHeadModel.from_pretrained( - # base_model, - # config, - # dtype=torch_dtype, - # device=cfg.device, - # ) - # model.train() # sets to train instead of eval mode elif model_type == "MambaLMHeadModel": # FIXME this is janky at best and hacked together to make it work MambaLMHeadModel = fix_mamba_attn_for_loss() # pylint: disable=invalid-name diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index e52f35ccca..380264a7ac 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -304,6 +304,10 @@ def setup_fsdp_envs(cfg): os.environ["FSDP_OFFLOAD_PARAMS"] = "true" if cfg.fsdp_config.fsdp_sync_module_states: os.environ["FSDP_SYNC_MODULE_STATES"] = "true" + if cfg.fsdp_config.fsdp_cpu_ram_efficient_loading: + os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = "true" + if cfg.fsdp_config.fsdp_use_orig_params: + os.environ["FSDP_USE_ORIG_PARAMS"] = "true" if cfg.fsdp_config.fsdp_state_dict_type: os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type if cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap: diff --git a/tests/e2e/test_mixtral.py b/tests/e2e/test_mixtral.py index ee6f06d875..68afcdec4a 100644 --- a/tests/e2e/test_mixtral.py +++ b/tests/e2e/test_mixtral.py @@ -77,7 +77,7 @@ def test_qlora_w_fa2(self, temp_dir): model, _ = train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) assert ( model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype - == torch.uint8 + == torch.float32 ) assert (Path(temp_dir) / "adapter_model.bin").exists() @@ -131,7 +131,7 @@ def test_qlora_wo_fa2(self, temp_dir): model, _ = train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) assert ( model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype - == torch.uint8 + == torch.float32 ) assert (Path(temp_dir) / "adapter_model.bin").exists()
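
For context on why the hand-rolled path above could be deleted: the removed replace_linear/load_and_quantize helpers existed to give every rank quantized 4-bit parameters (with an initialized quant_state) that FSDP could sync and shard. With the transformers, accelerate, and bitsandbytes versions pinned in requirements.txt, the same behavior comes from the "bnb_4bit_quant_storage": torch.bfloat16 entry added to load_model. A minimal sketch of that upstreamed loading path, outside axolotl and with a placeholder model id:

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # Storing the packed 4-bit blocks in bf16 keeps every parameter in a single
    # dtype, so FSDP can flatten and shard the model without custom wrapping or
    # per-rank quantization code.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_storage=torch.bfloat16,
    )

    # "NousResearch/Llama-2-7b-hf" is a placeholder model id for illustration only.
    model = AutoModelForCausalLM.from_pretrained(
        "NousResearch/Llama-2-7b-hf",
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
    )

With weights stored this way, the example YAML only needs the stock fsdp_config keys added above (cpu_ram_efficient_loading, sync_module_states, SHARDED_STATE_DICT), and the trainer override shrinks to toggling limit_all_gathers on the FSDP plugin that accelerate builds.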