diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index cbefb966a4..9a8d992759 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,6 +18,7 @@ jobs:
             python_version: "3.10"
             pytorch: 2.1.2
             axolotl_extras:
+            axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
             is_latest: true
           - cuda: 121
             cuda_version: 12.1.0
@@ -54,6 +55,7 @@ jobs:
             BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
             CUDA=${{ matrix.cuda }}
             PYTORCH_VERSION=${{ matrix.pytorch }}
+            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
           file: ./docker/Dockerfile
           push: ${{ github.event_name != 'pull_request' }}
           tags: |
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 802dbf0917..41eae1071a 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -70,6 +70,7 @@ jobs:
             cuda_version: 11.8.0
             python_version: "3.10"
             pytorch: 2.1.2
+            axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.10"
@@ -87,11 +88,13 @@ jobs:
           # Set up build arguments
           BASE_TAG="main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}"
           CUDA="${{ matrix.cuda }}"
+          AXOLOTL_ARGS="${{ matrix.axolotl_args }}"
           PYTORCH_VERSION="${{ matrix.pytorch }}"
           # Build the Docker image
           docker build . \
             --file ./docker/Dockerfile-tests \
             --build-arg BASE_TAG=$BASE_TAG \
+            --build-arg AXOLOTL_ARGS="$AXOLOTL_ARGS" \
             --build-arg CUDA=$CUDA \
             --build-arg GITHUB_REF=$GITHUB_REF \
             --build-arg PYTORCH_VERSION=$PYTORCH_VERSION \
diff --git a/docker/Dockerfile b/docker/Dockerfile
index efc40ab061..9154fcda3c 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -3,6 +3,7 @@ FROM winglian/axolotl-base:$BASE_TAG
 
 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
+ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
 ENV BNB_CUDA_VERSION=$CUDA
 ARG PYTORCH_VERSION="2.0.1"
@@ -20,9 +21,9 @@ WORKDIR /workspace/axolotl
 
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
     fi
 
 # So we can test the Docker image
diff --git a/docker/Dockerfile-tests b/docker/Dockerfile-tests
index 2ec94f8684..e7df99b472 100644
--- a/docker/Dockerfile-tests
+++ b/docker/Dockerfile-tests
@@ -3,6 +3,7 @@ FROM winglian/axolotl-base:$BASE_TAG
 
 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
+ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
 ENV BNB_CUDA_VERSION=$CUDA
 ARG PYTORCH_VERSION="2.0.1"
@@ -24,9 +25,9 @@ RUN git fetch origin +$GITHUB_REF && \
 
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
     fi
 
 # So we can test the Docker image
diff --git a/requirements.txt b/requirements.txt
index a5986fa4ff..6532d39991 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,7 +21,6 @@ hf_transfer
 colorama
 numba
 numpy>=1.24.4
-mlflow
 # qlora things
 evaluate==0.4.1
 scipy
diff --git a/setup.py b/setup.py
index d4a39b76ea..aa730fbe69 100644
--- a/setup.py
+++ b/setup.py
@@ -82,5 +82,8 @@ def parse_requirements():
         "auto-gptq": [
             "auto-gptq==0.5.1",
         ],
+        "mlflow": [
+            "mlflow",
+        ],
     },
 )
diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py
index c69f6cf5ab..5dca1e2b6b 100644
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -5,6 +5,7 @@
 
 import abc
 import importlib
+import importlib.util
 import logging
 import math
 import sys
@@ -34,7 +35,6 @@
     EvalFirstStepCallback,
     GPUStatsCallback,
     LossWatchDogCallback,
-    SaveAxolotlConfigtoMlflowCallback,
     SaveAxolotlConfigtoWandBCallback,
     SaveBetterTransformerModelCallback,
     bench_eval_callback_factory,
@@ -62,6 +62,10 @@
 LOG = logging.getLogger("axolotl.core.trainer_builder")
 
 
+def is_mlflow_available():
+    return importlib.util.find_spec("mlflow") is not None  # lookup only, no import
+
+
 def _sanitize_kwargs_for_tagging(tag_names, kwargs=None):
     if isinstance(tag_names, str):
         tag_names = [tag_names]
@@ -648,7 +652,11 @@ def get_callbacks(self):
             callbacks.append(
                 SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
             )
-        if self.cfg.use_mlflow:
+        if self.cfg.use_mlflow and is_mlflow_available():
+            from axolotl.utils.callbacks.mlflow_ import (
+                SaveAxolotlConfigtoMlflowCallback,
+            )
+
             callbacks.append(
                 SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path)
             )
diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
index 86dde18a6a..f727c74b82 100644
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -44,6 +44,30 @@
 LOG = logging.getLogger("axolotl")
 
 
+def is_xformers_swiglu_available() -> bool:
+    """
+    Check whether the xformers fused SwiGLU operator can be used.
+
+    Returns False when xformers cannot be imported or when it reports that the
+    `swiglu_packedw` operator is missing; returns True otherwise.
+    """
+    try:
+        from xformers.ops.common import get_xformers_operator
+    except ImportError:
+        # without xformers the operator cannot exist; report it as unavailable
+        return False
+
+    try:
+        # invoked with no arguments; the result is discarded, only errors matter
+        get_xformers_operator("swiglu_packedw")()
+        return True
+    except RuntimeError as exc:
+        if "No such operator xformers::swiglu_packedw " in str(exc):
+            return False
+        # any other RuntimeError is not the "missing operator" message
+        return True
+
+
 def replace_llama_mlp_with_swiglu(model):
     for name, module in model.named_modules():
         if isinstance(module, LlamaMLP):
diff --git a/src/axolotl/utils/callbacks.py b/src/axolotl/utils/callbacks/__init__.py
similarity index 96%
rename from src/axolotl/utils/callbacks.py
rename to src/axolotl/utils/callbacks/__init__.py
index f5357b02c6..6a489f6c0e 100644
--- a/src/axolotl/utils/callbacks.py
+++ b/src/axolotl/utils/callbacks/__init__.py
@@ -9,7 +9,6 @@
 from typing import TYPE_CHECKING, Dict, List
 
 import evaluate
-import mlflow
 import numpy as np
 import pandas as pd
 import torch
@@ -42,8 +41,8 @@
 if TYPE_CHECKING:
     from axolotl.core.trainer_builder import AxolotlTrainingArguments
 
-LOG = logging.getLogger("axolotl.callbacks")
 IGNORE_INDEX = -100
+LOG = logging.getLogger("axolotl.callbacks")
 
 
 class EvalFirstStepCallback(
@@ -756,31 +755,3 @@ def on_train_begin(
             except (FileNotFoundError, ConnectionError) as err:
                 LOG.warning(f"Error while saving Axolotl config to WandB: {err}")
         return control
-
-
-class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
-    """Callback to save axolotl config to mlflow"""
-
-    def __init__(self, axolotl_config_path):
-        self.axolotl_config_path = axolotl_config_path
-
-    def on_train_begin(
-        self,
-        args: AxolotlTrainingArguments,  # pylint: disable=unused-argument
-        state: TrainerState,  # pylint: disable=unused-argument
-        control: TrainerControl,
-        **kwargs,  # pylint: disable=unused-argument
-    ):
-        if is_main_process():
-            try:
-                with NamedTemporaryFile(
-                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
-                ) as temp_file:
-                    copyfile(self.axolotl_config_path, temp_file.name)
-                    mlflow.log_artifact(temp_file.name, artifact_path="")
-                    LOG.info(
-                        "The Axolotl config has been saved to the MLflow artifacts."
-                    )
-            except (FileNotFoundError, ConnectionError) as err:
-                LOG.warning(f"Error while saving Axolotl config to MLflow: {err}")
-        return control
diff --git a/src/axolotl/utils/callbacks/mlflow_.py b/src/axolotl/utils/callbacks/mlflow_.py
new file mode 100644
index 0000000000..fcbb88edcd
--- /dev/null
+++ b/src/axolotl/utils/callbacks/mlflow_.py
@@ -0,0 +1,57 @@
+"""MLFlow module for trainer callbacks"""
+import logging
+import os
+from shutil import copyfile
+from tempfile import NamedTemporaryFile
+from typing import TYPE_CHECKING
+
+import mlflow
+from transformers import TrainerCallback, TrainerControl, TrainerState
+
+from axolotl.utils.distributed import is_main_process
+
+if TYPE_CHECKING:
+    from axolotl.core.trainer_builder import AxolotlTrainingArguments
+
+LOG = logging.getLogger("axolotl.callbacks")
+
+
+class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
+    # pylint: disable=duplicate-code
+    """Callback to save axolotl config to mlflow"""
+
+    def __init__(self, axolotl_config_path):
+        self.axolotl_config_path = axolotl_config_path
+
+    def on_train_begin(
+        self,
+        args: "AxolotlTrainingArguments",  # pylint: disable=unused-argument
+        state: TrainerState,  # pylint: disable=unused-argument
+        control: TrainerControl,
+        **kwargs,  # pylint: disable=unused-argument
+    ):
+        """
+        On the main process, copy the axolotl config to a temporary file and log
+        it as an MLflow artifact. Returns `control` unchanged.
+        """
+        if is_main_process():
+            temp_path = None
+            try:
+                with NamedTemporaryFile(
+                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
+                ) as temp_file:
+                    temp_path = temp_file.name
+                # the handle is closed here; only the reserved path is needed
+                copyfile(self.axolotl_config_path, temp_path)
+                mlflow.log_artifact(temp_path, artifact_path="")
+                LOG.info("The Axolotl config has been saved to the MLflow artifacts.")
+            except (FileNotFoundError, ConnectionError) as err:
+                LOG.warning(f"Error while saving Axolotl config to MLflow: {err}")
+            finally:
+                # delete=False leaves the copy on disk, so remove it ourselves
+                if temp_path is not None:
+                    try:
+                        os.remove(temp_path)
+                    except OSError as err:
+                        LOG.debug(f"Could not remove {temp_path}: {err}")
+        return control
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index c5f3754458..c2006997de 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -512,11 +512,12 @@ def load_model(
 
             if cfg.flash_attention and not inference:
                 from axolotl.monkeypatch.llama_attn_hijack_flash import (
+                    is_xformers_swiglu_available,
                     replace_llama_mlp_with_swiglu,
                     replace_llama_qkv_with_fused,
                 )
 
-                if cfg.flash_attn_fuse_mlp:
+                if cfg.flash_attn_fuse_mlp and is_xformers_swiglu_available():
                     LOG.info("patching with SwiGLU")
                     replace_llama_mlp_with_swiglu(model)
 
diff --git a/tests/e2e/patched/test_fused_llama.py b/tests/e2e/patched/test_fused_llama.py
index dda08a4636..de1195c368 100644
--- a/tests/e2e/patched/test_fused_llama.py
+++ b/tests/e2e/patched/test_fused_llama.py
@@ -57,9 +57,9 @@ def test_fft_packing(self, temp_dir):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 10,
+                "save_steps": 5,
+                "eval_steps": 5,
             }
         )
         if is_torch_bf16_gpu_available():