diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index d7ded056941..a9fdb7b3e98 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -913,6 +913,7 @@ def falcon_forward( alibi: Optional[torch.Tensor], attention_mask: torch.Tensor, layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + position_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.Tensor] = None, use_cache: bool = False, output_attentions: bool = False, @@ -937,7 +938,7 @@ def falcon_forward( value_layer = value_layer.transpose(1, 2).reshape(batch_size * num_kv_heads, query_length, self.head_dim) past_kv_length = 0 if layer_past is None else layer_past[0].shape[1] - query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length) + query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length, position_ids) if layer_past is not None: past_key, past_value = layer_past diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py index 47aacd17b61..adcbfb4fd87 100644 --- a/optimum/bettertransformer/models/decoder_models.py +++ b/optimum/bettertransformer/models/decoder_models.py @@ -44,7 +44,7 @@ else: from ...utils.dummy_bettertransformer_objects import BarkSelfAttention -if check_if_transformers_greater("4.32"): +if check_if_transformers_greater("4.34"): from transformers.models.falcon.modeling_falcon import FalconAttention else: from ...utils.dummy_bettertransformer_objects import FalconAttention diff --git a/optimum/utils/dummy_bettertransformer_objects.py b/optimum/utils/dummy_bettertransformer_objects.py index e0d982c4263..83a88076563 100644 --- a/optimum/utils/dummy_bettertransformer_objects.py +++ b/optimum/utils/dummy_bettertransformer_objects.py @@ -16,10 +16,10 @@ def __init__(self, *args, **kwargs): class FalconAttention(metaclass=DummyObject): - _backends = ["transformers_432"] + _backends = ["transformers_434"] def __init__(self, *args, **kwargs): - requires_backends(self, ["transformers_432"]) + requires_backends(self, ["transformers_434"]) def _llama_prepare_decoder_attention_mask(*args, **kwargs): diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index 773f61576e2..ed310a43ea5 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -201,6 +201,10 @@ def require_numpy_strictly_lower(version: str, message: str): "transformers_432", (lambda: check_if_transformers_greater("4.32"), "{0} " + TRANSFORMERS_IMPORT_ERROR.format("4.32")), ), + ( + "transformers_434", + (lambda: check_if_transformers_greater("4.34"), "{0} " + TRANSFORMERS_IMPORT_ERROR.format("4.34")), + ), ] ) diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py index 113c59f63cb..ed453d06d7a 100644 --- a/tests/bettertransformer/testing_utils.py +++ b/tests/bettertransformer/testing_utils.py @@ -43,7 +43,7 @@ "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", "ernie": "hf-internal-testing/tiny-random-ErnieModel", - "falcon": "Rocketknight1/tiny-random-falcon-7b", + "falcon": "fxmarty/really-tiny-falcon-testing", "fsmt": "hf-internal-testing/tiny-random-FSMTModel", "gpt2": "hf-internal-testing/tiny-random-GPT2Model", # NOTE: this tiny model does not use attention_softmax_in_fp32=True (contrary to e.g. starcoder)