huggingface · Cyrilvallez · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py
@@ -279,15 +279,10 @@ def _compute_longrope_parameters(
     # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
     # values to compute the default attention scaling factor, instead of using `factor`.
     if hasattr(config, "original_max_position_embeddings"):
-        if seq_len and seq_len < config.original_max_position_embeddings:
-            expanded_max_position_embeddings = config.original_max_position_embeddings
-        else:
-            expanded_max_position_embeddings = config.max_position_embeddings
         max_position_embeddings = config.original_max_position_embeddings
-        factor = expanded_max_position_embeddings / max_position_embeddings
+        factor = config.max_position_embeddings / config.original_max_position_embeddings
     else:
         max_position_embeddings = config.max_position_embeddings
-        expanded_max_position_embeddings = max_position_embeddings * factor
 
     # Sets the attention factor as suggested in the paper
     if attention_factor is None:
@@ -297,7 +292,7 @@ def _compute_longrope_parameters(
             attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))
 
     # Compute the inverse frequencies -- scaled based on the target sequence length
-    if expanded_max_position_embeddings > max_position_embeddings:
+    if seq_len and seq_len > max_position_embeddings:
         ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
     else:
         ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)

diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py
@@ -723,11 +723,7 @@ def _init_weights(self, module):
 
 
 class AriaTextRotaryEmbedding(nn.Module):
-    def __init__(
-        self,
-        config: AriaTextConfig,
-        device=None,
-    ):
+    def __init__(self, config: AriaTextConfig, device=None):
         super().__init__()
         self.rope_kwargs = {}
         # BC: "rope_type" was originally "type"

diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py
@@ -120,11 +120,7 @@ def __init__(self, config: BambaConfig, batch_size, dtype=torch.float16, device=
 
 
 class BambaRotaryEmbedding(nn.Module):
-    def __init__(
-        self,
-        config: BambaConfig,
-        device=None,
-    ):
+    def __init__(self, config: BambaConfig, device=None):
         super().__init__()
         self.rope_kwargs = {}
         # BC: "rope_type" was originally "type"

diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py