diff --git a/src/axolotl/integrations/diff_transformer/patches.py b/src/axolotl/monkeypatch/attention/differential.py
similarity index 95%
rename from src/axolotl/integrations/diff_transformer/patches.py
rename to src/axolotl/monkeypatch/attention/differential.py
index 37ad0a981b..037a6f0bd2 100644
--- a/src/axolotl/integrations/diff_transformer/patches.py
+++ b/src/axolotl/monkeypatch/attention/differential.py
@@ -3,7 +3,7 @@
 from transformers import PreTrainedModel
 from transformers.models.llama.modeling_llama import LLAMA_ATTENTION_CLASSES
 
-from .multihead_diffattn import (
+from axolotl.integrations.diff_transformer.multihead_diffattn import (
     LlamaDifferentialAttention,
     LlamaDifferentialSdpaAttention,
 )
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 0b6277b6ff..5d51fd4368 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -456,7 +456,7 @@ def apply_patches(self) -> None:
             patch_mistral_cross_entropy()
 
         if self.cfg.diff_attention:
-            from axolotl.integrations.diff_transformer.patches import (
+            from axolotl.monkeypatch.attention.differential import (
                 patch_llama_attention_classes,
             )
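
For downstream callers, this rename changes only the import path; the entry point keeps its name. A minimal sketch of a call site after this diff; the zero-argument call to patch_llama_attention_classes is an assumption for illustration, since the diff does not show the function's signature:

# New location after this diff; previously importable from
# axolotl.integrations.diff_transformer.patches.
from axolotl.monkeypatch.attention.differential import (
    patch_llama_attention_classes,
)

# Assumed usage (not shown in this diff): apply the monkeypatch before the
# model is instantiated, so the differential attention classes imported in
# differential.py are registered against LLAMA_ATTENTION_CLASSES. The
# zero-argument call is an assumption about the signature.
patch_llama_attention_classes()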