fix: Qwen2-VL generate with inputs_embeds (#35466)
* fix: Qwen2-VL generate with inputs_embeds

* change: optional input_ids in get_rope_index
minostauros authored Jan 8, 2025
1 parent 88e18b3 commit 4349a0e
Showing 2 changed files with 6 additions and 13 deletions.
src/transformers/models/qwen2_vl/modeling_qwen2_vl.py (5 additions, 10 deletions)

@@ -32,13 +32,8 @@
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
 from ...generation import GenerationMixin
-from ...modeling_attn_mask_utils import (
-    AttentionMaskConverter,
-)
-from ...modeling_outputs import (
-    BaseModelOutputWithPast,
-    ModelOutput,
-)
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from ...modeling_utils import PreTrainedModel
 from ...utils import (

@@ -1420,7 +1415,7 @@ def get_decoder(self):

     def get_rope_index(
         self,
-        input_ids: torch.LongTensor,
+        input_ids: Optional[torch.LongTensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
         video_grid_thw: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,

@@ -1550,7 +1545,7 @@ def get_rope_index(
             if attention_mask is not None:
                 position_ids = attention_mask.long().cumsum(-1) - 1
                 position_ids.masked_fill_(attention_mask == 0, 1)
-                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
                 max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
                 mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
             else:

@@ -1676,7 +1671,7 @@ def forward(
                 attention_mask = attention_mask.to(inputs_embeds.device)
 
         # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
-        if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2):
+        if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
             # calculate RoPE index once per generation in the pre-fill stage only
             if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None:
                 position_ids, rope_deltas = self.get_rope_index(
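
The hunks above make two related changes: get_rope_index no longer requires input_ids (its text-only branch now reads the target device from attention_mask), and forward computes the RoPE index even when only inputs_embeds is passed. Below is a minimal sketch of that text-only position-id computation, using an assumed toy attention mask (not part of the commit):

import torch

# assumed toy input: one left-padded sequence of length 4
attention_mask = torch.tensor([[0, 1, 1, 1]])

# same computation as the changed branch; no input_ids tensor is needed
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)

# per-sample delta between the M-RoPE positions and the sequence length
max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]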
tests/generation/test_utils.py (1 addition, 3 deletions)

@@ -1615,9 +1615,7 @@ def test_generate_from_inputs_embeds(self, _, num_beams):

         # There are a few exception patterns in this test:
         # 1 - Some models can't generate without `input_ids`, when `inputs_embeds` are passed
-        requires_inputs_ids = any(
-            model_name in model_class.__name__.lower() for model_name in ["idefics", "qwen2vl"]
-        )
+        requires_inputs_ids = any(model_name in model_class.__name__.lower() for model_name in ["idefics"])
         # 2 - Complex `inputs_embeds` computation, i.e. the correct computation of inputs embeds is more complex
         # than calling the embedding layer with `input_ids`. Subcases of this exception:
         # 2.A - Ignore `scale_embedding`, if the model supports it (it is controlled by a model-dependent flag)
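
With Qwen2-VL removed from the requires_inputs_ids exception list, the generic test now also covers generating from inputs_embeds without input_ids for this model. A minimal text-only usage sketch of what the fix enables; the checkpoint name and generation settings are illustrative and not part of the commit:

import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

model_id = "Qwen/Qwen2-VL-2B-Instruct"  # illustrative checkpoint
processor = AutoProcessor.from_pretrained(model_id)
model = Qwen2VLForConditionalGeneration.from_pretrained(model_id)

inputs = processor(text=["Describe the sky in one sentence."], return_tensors="pt")

# Build inputs_embeds from the embedding layer and generate without passing input_ids.
inputs_embeds = model.get_input_embeddings()(inputs.input_ids)
output_ids = model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=inputs.attention_mask,
    max_new_tokens=20,
)
print(processor.batch_decode(output_ids, skip_special_tokens=True))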
