diff --git a/examples/deepseek-v2/qlora-fsdp-2_5.yaml b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
new file mode 100644
index 0000000000..6e82062d66
--- /dev/null
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -0,0 +1,83 @@
+base_model: axolotl-quants/DeepSeek-V2.5-bnb-nf4-bf16
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+liger_rms_norm: true
+liger_swiglu: true
+liger_fused_linear_cross_entropy: true
+
+chat_template: deepseek_v2
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+adapter: qlora
+lora_r: 256
+lora_alpha: 256
+lora_target_linear: true
+peft_use_rslora: true
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+num_epochs: 1
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 2
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+special_tokens:
+fsdp:
+  - full_shard
+  - auto_wrap
+fsdp_config:
+  fsdp_limit_all_gathers: true
+  fsdp_sync_module_states: true
+  fsdp_offload_params: true
+  fsdp_use_orig_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
diff --git a/requirements.txt b/requirements.txt
index 83116af60f..32a9e0e01c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,11 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
 peft==0.12.0
-transformers==4.44.2
+transformers @ git+https://github.com/huggingface/transformers.git@0963229e287501bed52ae1dabc17922524de6992
 tokenizers>=0.19.1
 bitsandbytes==0.43.3
 accelerate==0.34.2
-datasets==2.20.0
+datasets==2.21.0
 deepspeed==0.14.4
 pydantic==2.6.3
 addict
diff --git a/src/axolotl/monkeypatch/transformers_dynamic_module_utils.py b/src/axolotl/monkeypatch/transformers_dynamic_module_utils.py
deleted file mode 100644
index dfc3e29c5a..0000000000
--- a/src/axolotl/monkeypatch/transformers_dynamic_module_utils.py
+++ /dev/null
@@ -1,51 +0,0 @@
-"""Patch transformers.dynamic_module_utils.get_class_in_module to avoid reloading models from disk"""
-
-import importlib
-import os
-import sys
-import typing
-from pathlib import Path
-
-from transformers.file_utils import HF_MODULES_CACHE
-
-
-def _patched_get_class_in_module(
-    class_name: str, module_path: typing.Union[str, os.PathLike]
-) -> typing.Type:
-    """
-    Import a module on the cache directory for modules and extract a class from it.
-
-    Args:
-        class_name (`str`): The name of the class to import.
-        module_path (`str` or `os.PathLike`): The path to the module to import.
-
-    Returns:
-        `typing.Type`: The class looked for.
-    """
-    name = os.path.normpath(module_path)
-    if name.endswith(".py"):
-        name = name[:-3]
-    name = name.replace(os.path.sep, ".")
-    module_spec = importlib.util.spec_from_file_location(
-        name, location=Path(HF_MODULES_CACHE) / module_path
-    )
-    module = sys.modules.get(name)
-    if module is None:
-        module = importlib.util.module_from_spec(module_spec)
-        # insert it into sys.modules before any loading begins
-        sys.modules[name] = module
-        # load in initial case only
-        module_spec.loader.exec_module(module)
-    return getattr(module, class_name)
-
-
-def patch_transformers_dynamic_module_utils():
-    """
-    Recently, transformers started reloading modeling code from disk for models marked trust_remote_code=True.
-    This causes monkey-patches for multipack and liger to be removed.
-    We replace the original function with a version that does not reload the module from disk.
-    See https://github.com/huggingface/transformers/pull/30370#pullrequestreview-2264361581
-    """
-    import transformers
-
-    transformers.dynamic_module_utils.get_class_in_module = _patched_get_class_in_module
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index e0526fb048..e183301991 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -43,9 +43,6 @@
     SUPPORTED_MULTIPACK_MODEL_TYPES,
     patch_for_multipack,
 )
-from axolotl.monkeypatch.transformers_dynamic_module_utils import (
-    patch_transformers_dynamic_module_utils,
-)
 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
 from axolotl.utils.bench import log_gpu_memory_usage
 from axolotl.utils.chat_templates import chat_templates
@@ -57,8 +54,6 @@
 LOG = logging.getLogger("axolotl")

-patch_transformers_dynamic_module_utils()
-

 # copied from accelerator.FullyShardedDataParallelPlugin
 def get_module_class_from_name(module, name):