diff --git a/examples/deepseek-v2/qlora-fsdp-2_5.yaml b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
new file mode 100644
index 0000000000..6e82062d66
--- /dev/null
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -0,0 +1,83 @@
+base_model: axolotl-quants/DeepSeek-V2.5-bnb-nf4-bf16
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+liger_rms_norm: true
+liger_swiglu: true
+liger_fused_linear_cross_entropy: true
+
+chat_template: deepseek_v2
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+adapter: qlora
+lora_r: 256
+lora_alpha: 256
+lora_target_linear: true
+peft_use_rslora: true
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+num_epochs: 1
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 2
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+special_tokens:
+fsdp:
+  - full_shard
+  - auto_wrap
+fsdp_config:
+  fsdp_limit_all_gathers: true
+  fsdp_sync_module_states: true
+  fsdp_offload_params: true
+  fsdp_use_orig_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
diff --git a/requirements.txt b/requirements.txt
index 83116af60f..32a9e0e01c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,11 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
 peft==0.12.0
-transformers==4.44.2
+transformers @ git+https://github.com/huggingface/transformers.git@0963229e287501bed52ae1dabc17922524de6992
 tokenizers>=0.19.1
 bitsandbytes==0.43.3
 accelerate==0.34.2
-datasets==2.20.0
+datasets==2.21.0
 deepspeed==0.14.4
 pydantic==2.6.3
 addict
diff --git a/src/axolotl/monkeypatch/transformers_dynamic_module_utils.py b/src/axolotl/monkeypatch/transformers_dynamic_module_utils.py
deleted file mode 100644
index dfc3e29c5a..0000000000
--- a/src/axolotl/monkeypatch/transformers_dynamic_module_utils.py
+++ /dev/null
@@ -1,51 +0,0 @@
-"""Patch transformers.dynamic_module_utils.get_class_in_module to avoid reloading models from disk"""
-
-import importlib
-import os
-import sys
-import typing
-from pathlib import Path
-
-from transformers.file_utils import HF_MODULES_CACHE
-
-
-def _patched_get_class_in_module(
-    class_name: str, module_path: typing.Union[str, os.PathLike]
-) -> typing.Type:
-    """
-    Import a module on the cache directory for modules and extract a class from it.
-
-    Args:
-        class_name (`str`): The name of the class to import.
-        module_path (`str` or `os.PathLike`): The path to the module to import.
-
-    Returns:
-        `typing.Type`: The class looked for.
-    """
-    name = os.path.normpath(module_path)
-    if name.endswith(".py"):
-        name = name[:-3]
-    name = name.replace(os.path.sep, ".")
-    module_spec = importlib.util.spec_from_file_location(
-        name, location=Path(HF_MODULES_CACHE) / module_path
-    )
-    module = sys.modules.get(name)
-    if module is None:
-        module = importlib.util.module_from_spec(module_spec)
-        # insert it into sys.modules before any loading begins
-        sys.modules[name] = module
-        # load in initial case only
-        module_spec.loader.exec_module(module)
-    return getattr(module, class_name)
-
-
-def patch_transformers_dynamic_module_utils():
-    """
-    Recently, transformers started reloading modeling code from disk for models marked trust_remote_code=True.
-    This causes monkey-patches for multipack and liger to be removed.
-    We replace the original function with a version that does not reload the module from disk.
-    See https://github.com/huggingface/transformers/pull/30370#pullrequestreview-2264361581
-    """
-    import transformers
-
-    transformers.dynamic_module_utils.get_class_in_module = _patched_get_class_in_module
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index e0526fb048..e183301991 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -43,9 +43,6 @@
     SUPPORTED_MULTIPACK_MODEL_TYPES,
     patch_for_multipack,
 )
-from axolotl.monkeypatch.transformers_dynamic_module_utils import (
-    patch_transformers_dynamic_module_utils,
-)
 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
 from axolotl.utils.bench import log_gpu_memory_usage
 from axolotl.utils.chat_templates import chat_templates
@@ -57,8 +54,6 @@
 LOG = logging.getLogger("axolotl")

-patch_transformers_dynamic_module_utils()
-

 # copied from accelerator.FullyShardedDataParallelPlugin
 def get_module_class_from_name(module, name):