From 1fb575fcf0ed6d1895a21e1c28b10b6287bd28ec Mon Sep 17 00:00:00 2001 From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:48:21 +0200 Subject: [PATCH 01/99] Support boolean tool args (#34208) Support boolean tool arguments --- src/transformers/agents/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/agents/tools.py b/src/transformers/agents/tools.py index cfb1e4cf95c..a425ffc8f10 100644 --- a/src/transformers/agents/tools.py +++ b/src/transformers/agents/tools.py @@ -138,7 +138,7 @@ def validate_arguments(self): "inputs": Dict, "output_type": str, } - authorized_types = ["string", "integer", "number", "image", "audio", "any"] + authorized_types = ["string", "integer", "number", "image", "audio", "any", "boolean"] for attr, expected_type in required_attributes.items(): attr_value = getattr(self, attr, None) From d9f733625c43158f3fa52377f2f8bf49350160f3 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 23 Oct 2024 11:24:57 -0400 Subject: [PATCH 02/99] Enable Gradient Accumulation fix across all models + trainer fully in forward() (#34283) * Enable grad accum fix across all models + trainer fully in forward() * handle peft case * Account for DDP: need to run scale tests * Use accelerator state * Quality * Guard * Experiment w/ only fairseq fix * Fairseq only * Revert multiply_grads fix * Mult by grad accum to fully bring back solution * Style * Good to go now * Skip fx tests for now * Bookmark * Working now --- .../models/cohere/modeling_cohere.py | 3 +- .../models/gemma/modeling_gemma.py | 3 +- .../models/gemma/modular_gemma.py | 3 +- .../models/gemma2/modeling_gemma2.py | 3 +- .../models/gemma2/modular_gemma2.py | 3 +- src/transformers/models/glm/modeling_glm.py | 3 +- .../models/jamba/modeling_jamba.py | 3 +- .../models/mixtral/modeling_mixtral.py | 3 +- .../models/mllama/modeling_mllama.py | 3 +- .../models/nemotron/modeling_nemotron.py | 3 +- 
src/transformers/models/olmo/modeling_olmo.py | 3 +- .../models/olmoe/modeling_olmoe.py | 3 +- src/transformers/models/phi/modeling_phi.py | 3 +- src/transformers/models/phi3/modeling_phi3.py | 3 +- .../models/phimoe/modeling_phimoe.py | 3 +- .../models/qwen2/modeling_qwen2.py | 3 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 3 +- .../models/rt_detr/modeling_rt_detr.py | 2 ++ .../models/zamba/modeling_zamba.py | 3 +- src/transformers/trainer.py | 36 ++++++++++++------- tests/models/cohere/test_modeling_cohere.py | 4 +++ tests/models/mistral/test_modeling_mistral.py | 4 +++ tests/models/mixtral/test_modeling_mixtral.py | 4 +++ tests/models/qwen2/test_modeling_qwen2.py | 4 +++ .../qwen2_moe/test_modeling_qwen2_moe.py | 4 +++ 25 files changed, 81 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 3abe6ef8644..9aa588be431 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -1114,6 +1114,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1172,7 +1173,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 6f364ffcf7e..9a4de1022c5 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -1030,6 +1030,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, 
CausalLMOutputWithPast]: r""" Args: @@ -1087,7 +1088,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index c3d780bc571..807f91ff9e6 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -961,6 +961,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" ```python @@ -1003,7 +1004,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 467981bb78d..6d61c47619f 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -1002,6 +1002,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1068,7 +1069,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 49010152b81..7ddb1c9f4c9 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ 
b/src/transformers/models/gemma2/modular_gemma2.py @@ -756,6 +756,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" ```python @@ -807,7 +808,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index a458c02a6fe..aad4da282b7 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -1014,6 +1014,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1071,7 +1072,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 737be17cfc1..32ae6ea02eb 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -1450,6 +1450,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: Optional[Union[int, None]] = None, + **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" Args: @@ -1515,7 +1516,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) aux_loss = None if 
output_router_logits: diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index f5f11ba995c..192b7801af0 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1240,6 +1240,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" Args: @@ -1303,7 +1304,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) aux_loss = None if output_router_logits: diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index c5ae615a12b..8ce6150a2fa 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -1887,6 +1887,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1949,7 +1950,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index d5470dbbaa1..d4eb348260c 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -1028,6 +1028,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, 
CausalLMOutputWithPast]: r""" Args: @@ -1085,7 +1086,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 6c7dc59cdbf..60225d4759c 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -1068,6 +1068,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1126,7 +1127,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index 32f7ded42e8..cbb8db0f59d 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -1228,6 +1228,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" Args: @@ -1290,7 +1291,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) aux_loss = None if output_router_logits: diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index ef1a5b4d0ec..4613672ff27 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py 
@@ -1192,6 +1192,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1250,7 +1251,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 16601e1f995..9e638c27afa 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -1209,6 +1209,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1275,7 +1276,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index 559daeca694..791f6df50bb 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -1377,6 +1377,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" Args: @@ -1442,7 +1443,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) aux_loss = None if output_router_logits: diff --git 
a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index a6e4d12d799..0d97f2ffb72 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -1121,6 +1121,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1179,7 +1180,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index d482316b5b8..36de586265c 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1305,6 +1305,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: r""" Args: @@ -1367,7 +1368,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) aux_loss = None if output_router_logits: diff --git a/src/transformers/models/rt_detr/modeling_rt_detr.py b/src/transformers/models/rt_detr/modeling_rt_detr.py index 1c09025a34b..cae48455047 100644 --- a/src/transformers/models/rt_detr/modeling_rt_detr.py +++ b/src/transformers/models/rt_detr/modeling_rt_detr.py @@ -2027,6 +2027,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + **loss_kwargs, ) -> Union[Tuple[torch.FloatTensor], 
RTDetrObjectDetectionOutput]: r""" labels (`List[Dict]` of len `(batch_size,)`, *optional*): @@ -2128,6 +2129,7 @@ def forward( enc_topk_logits=enc_topk_logits, enc_topk_bboxes=enc_topk_bboxes, denoising_meta_values=denoising_meta_values, + **loss_kwargs, ) if not return_dict: diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py index 921d07f287d..dee7f898fcf 100644 --- a/src/transformers/models/zamba/modeling_zamba.py +++ b/src/transformers/models/zamba/modeling_zamba.py @@ -1418,6 +1418,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, + **loss_kwargs, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1477,7 +1478,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits, labels, self.vocab_size) + loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 7890e084871..1b13787007e 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -582,6 +582,16 @@ def __init__( self.model_wrapped = model self.model = model + # Just in case the model was wrapped outside of the `Trainer` + unwrapped_model = self.accelerator.unwrap_model(model) + model_forward = ( + unwrapped_model.forward + if not _is_peft_model(unwrapped_model) + else unwrapped_model.get_base_model().forward + ) + + self.model_accepts_loss_kwargs = "loss_kwargs" in inspect.signature(model_forward).parameters + self.neftune_noise_alpha = args.neftune_noise_alpha self.compute_metrics = compute_metrics @@ -2417,8 +2427,14 @@ def _inner_training_loop( for inputs in batch_samples: step += 1 total_batched_samples += 1 + is_last_step_and_steps_less_than_grad_acc = ( + steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch + ) + do_sync_step = 
is_last_step_and_steps_less_than_grad_acc or ( + total_batched_samples % args.gradient_accumulation_steps == 0 + ) # Since we perform prefetching, we need to manually set sync_gradients - if total_batched_samples % args.gradient_accumulation_steps != 0: + if not do_sync_step: self.accelerator.gradient_state._set_sync_gradients(False) else: self.accelerator.gradient_state._set_sync_gradients(True) @@ -2473,16 +2489,7 @@ def _inner_training_loop( self.current_flos += float(self.floating_point_ops(inputs)) - is_last_step_and_steps_less_than_grad_acc = ( - steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch - ) - - if ( - (total_batched_samples) % args.gradient_accumulation_steps == 0 - or - # last step in epoch but step is always smaller than gradient_accumulation_steps - is_last_step_and_steps_less_than_grad_acc - ): + if do_sync_step: # Since we perform prefetching, we need to manually set sync_gradients to True self.accelerator.gradient_state._set_sync_gradients(True) @@ -3610,8 +3617,11 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N labels = inputs.pop("labels") else: labels = None - # if num_items_in_batch is not None: - # inputs["num_items_in_batch"] = num_items_in_batch + if self.model_accepts_loss_kwargs: + loss_kwargs = {} + if num_items_in_batch is not None: + loss_kwargs["num_items_in_batch"] = num_items_in_batch + inputs = {**inputs, **loss_kwargs} outputs = model(**inputs) # Save past state if it exists # TODO: this needs to be fixed and made cleaner later. 
diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py index 7d12dd3d873..b8a5aec9d41 100644 --- a/tests/models/cohere/test_modeling_cohere.py +++ b/tests/models/cohere/test_modeling_cohere.py @@ -304,6 +304,10 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="PR #34283 made changes to the forward function.") + def test_torch_fx_output_loss(self): + super().test_torch_fx_output_loss() + @require_bitsandbytes @require_torch_sdpa @require_torch_multi_gpu diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index ff7f1e87bc1..13e5e3d1f60 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -356,6 +356,10 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="PR #34283 made changes to the forward function.") + def test_torch_fx_output_loss(self): + super().test_torch_fx_output_loss() + def test_Mistral_sequence_classification_model(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() print(config) diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 0e6b2a999e8..0bfb5126ebd 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -356,6 +356,10 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="PR #34283 made changes to the forward function.") + def test_torch_fx_output_loss(self): + super().test_torch_fx_output_loss() + def test_Mixtral_sequence_classification_model(self): 
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() print(config) diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 1fee3192a64..769d6caabd9 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -368,6 +368,10 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="PR #34283 made changes to the forward function.") + def test_torch_fx_output_loss(self): + super().test_torch_fx_output_loss() + def test_Qwen2_sequence_classification_model(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() print(config) diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index d7b17b740f9..374d9472ca2 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -391,6 +391,10 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) + @unittest.skip(reason="PR #34283 made changes to the forward function.") + def test_torch_fx_output_loss(self): + super().test_torch_fx_output_loss() + def test_Qwen2Moe_sequence_classification_model(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() print(config) From c42b3223db0fc24ff9a694f19e6c78faf3ac58a1 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 23 Oct 2024 17:27:51 +0200 Subject: [PATCH 03/99] skip `test_pipeline_depth_estimation` temporarily (#34316) skip Co-authored-by: ydshieh --- tests/models/glpn/test_modeling_glpn.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py 
index 81e95ab244f..254c1135357 100644 --- a/tests/models/glpn/test_modeling_glpn.py +++ b/tests/models/glpn/test_modeling_glpn.py @@ -157,6 +157,14 @@ def setUp(self): self.model_tester = GLPNModelTester(self) self.config_tester = GLPNConfigTester(self, config_class=GLPNConfig) + @unittest.skip(reason="Failing after #32550") + def test_pipeline_depth_estimation(self): + pass + + @unittest.skip(reason="Failing after #32550") + def test_pipeline_depth_estimation_fp16(self): + pass + def test_config(self): self.config_tester.run_common_tests() From e50bf61decf741c6d59e4ba633b7392712673bda Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 23 Oct 2024 18:33:52 +0200 Subject: [PATCH 04/99] Fix red CI: benchmark script (#34351) * dont'trigger always * fux * oups * update * ?? * ? * aie --- .github/workflows/benchmark.yml | 12 ++--- scripts/deberta_scrtipt.py | 82 +++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 8 deletions(-) create mode 100644 scripts/deberta_scrtipt.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c264dfe462a..79f0652e192 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -18,21 +18,17 @@ jobs: name: Benchmark runs-on: group: aws-g5-4xlarge-cache + if: | + (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark') )|| + (github.event_name == 'push' && github.ref == 'refs/heads/main') container: image: huggingface/transformers-pytorch-gpu options: --gpus all --privileged --ipc host steps: - name: Get repo - if: github.event_name == 'pull_request' uses: actions/checkout@v4 with: - ref: ${{ github.event.pull_request.head.sha }} - - - name: Get repo - if: github.event_name == 'push' - uses: actions/checkout@v4 - with: - ref: ${{ github.sha }} + ref: ${{ github.event.pull_request.head.sha || github.sha }} - name: Install libpq-dev & psql run: | diff --git 
a/scripts/deberta_scrtipt.py b/scripts/deberta_scrtipt.py new file mode 100644 index 00000000000..b910d8de3f5 --- /dev/null +++ b/scripts/deberta_scrtipt.py @@ -0,0 +1,82 @@ +import torch +from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForMaskedLM +import time + +test_sentence = 'Do you [MASK] the muffin man?' + +# for comparison +bert = pipeline('fill-mask', model = 'bert-base-uncased') +print('\n'.join([d['sequence'] for d in bert(test_sentence)])) + + +deberta = pipeline('fill-mask', model = 'microsoft/deberta-v3-base', model_kwargs={"legacy": False}) +print('\n'.join([d['sequence'] for d in deberta(test_sentence)])) + + +tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base") + +tokenized_dict = tokenizer( + ["Is this working",], ["Not yet",], + return_tensors="pt" +) + +deberta.model.forward = torch.compile(deberta.model.forward) +start=time.time() +deberta.model(**tokenized_dict) +end=time.time() +print(end-start) + + +start=time.time() +deberta.model(**tokenized_dict) +end=time.time() +print(end-start) + + +start=time.time() +deberta.model(**tokenized_dict) +end=time.time() +print(end-start) + + +model = AutoModel.from_pretrained('microsoft/deberta-base') +model.config.return_dict = False +model.config.output_hidden_states=False +input_tuple = (tokenized_dict['input_ids'], tokenized_dict['attention_mask']) + + +start=time.time() +traced_model = torch.jit.trace(model, input_tuple) +end=time.time() +print(end-start) + + +start=time.time() +traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask']) +end=time.time() +print(end-start) + + +start=time.time() +traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask']) +end=time.time() +print(end-start) + + +start=time.time() +traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask']) +end=time.time() +print(end-start) + + +start=time.time() +traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask']) 
+end=time.time() +print(end-start) + + +torch.jit.save(traced_model, "compiled_deberta.pt") + + + +# my_script_module = torch.jit.script(model) From b0f0c61899019d316db17a493023828aa44db06d Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 23 Oct 2024 21:18:52 +0100 Subject: [PATCH 05/99] Add SynthID (watermerking by Google DeepMind) (#34350) * Add SynthIDTextWatermarkLogitsProcessor * esolving comments. * Resolving comments. * esolving commits, * Improving SynthIDWatermark tests. * switch to PT version * detector as pretrained model + style * update training + style * rebase * Update logits_process.py * Improving SynthIDWatermark tests. * Shift detector training to wikitext negatives and stabilize with lower learning rate. * Clean up. * in for 7B * cleanup * upport python 3.8. * README and final cleanup. * HF Hub upload and initiaze. * Update requirements for synthid_text. * Adding SynthIDTextWatermarkDetector. * Detector testing. * Documentation changes. * Copyrights fix. * Fix detector api. 
* ironing out errors * ironing out errors * training checks * make fixup and make fix-copies * docstrings and add to docs * copyright * BC * test docstrings * move import * protect type hints * top level imports * watermarking example * direct imports * tpr fpr meaning * process_kwargs * SynthIDTextWatermarkingConfig docstring * assert -> exception * example updates * no immutable dict (cant be serialized) * pack fn * einsum equivalent * import order * fix test on gpu * add detector example --------- Co-authored-by: Sumedh Ghaisas Co-authored-by: Marc Sun Co-authored-by: sumedhghaisas2 <138781311+sumedhghaisas2@users.noreply.github.com> Co-authored-by: raushan --- docs/source/en/internal/generation_utils.md | 18 + .../source/en/main_classes/text_generation.md | 2 - .../research_projects/synthid_text/README.md | 34 ++ .../synthid_text/detector_training.py | 502 ++++++++++++++++++ .../synthid_text/requirements.txt | 5 + .../research_projects/synthid_text/utils.py | 408 ++++++++++++++ src/transformers/__init__.py | 10 + src/transformers/generation/__init__.py | 24 +- .../generation/configuration_utils.py | 221 ++++++-- src/transformers/generation/logits_process.py | 478 ++++++++++++++++- src/transformers/generation/utils.py | 11 +- src/transformers/generation/watermarking.py | 322 ++++++++++- src/transformers/utils/dummy_pt_objects.py | 35 ++ tests/generation/test_logits_process.py | 186 +++++++ tests/generation/test_utils.py | 62 ++- 15 files changed, 2238 insertions(+), 80 deletions(-) create mode 100644 examples/research_projects/synthid_text/README.md create mode 100644 examples/research_projects/synthid_text/detector_training.py create mode 100644 examples/research_projects/synthid_text/requirements.txt create mode 100644 examples/research_projects/synthid_text/utils.py diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index a81d202c663..946940cb019 100644 --- a/docs/source/en/internal/generation_utils.md +++ 
b/docs/source/en/internal/generation_utils.md @@ -185,6 +185,9 @@ generation. [[autodoc]] SuppressTokensLogitsProcessor - __call__ +[[autodoc]] SynthIDTextWatermarkLogitsProcessor + - __call__ + [[autodoc]] TemperatureLogitsWarper - __call__ @@ -418,5 +421,20 @@ A [`Constraint`] can be used to force the generation to include specific tokens ## Watermark Utils +[[autodoc]] WatermarkingConfig + - __call__ + [[autodoc]] WatermarkDetector - __call__ + +[[autodoc]] BayesianDetectorConfig + - __call__ + +[[autodoc]] BayesianDetectorModel + - __call__ + +[[autodoc]] SynthIDTextWatermarkingConfig + - __call__ + +[[autodoc]] SynthIDTextWatermarkDetector + - __call__ diff --git a/docs/source/en/main_classes/text_generation.md b/docs/source/en/main_classes/text_generation.md index 574e4c75a6a..76a0f1381cd 100644 --- a/docs/source/en/main_classes/text_generation.md +++ b/docs/source/en/main_classes/text_generation.md @@ -41,8 +41,6 @@ like token streaming. - validate - get_generation_mode -[[autodoc]] generation.WatermarkingConfig - ## GenerationMixin [[autodoc]] GenerationMixin diff --git a/examples/research_projects/synthid_text/README.md b/examples/research_projects/synthid_text/README.md new file mode 100644 index 00000000000..30ab9990373 --- /dev/null +++ b/examples/research_projects/synthid_text/README.md @@ -0,0 +1,34 @@ +# SynthID Text + +This project showcases the use of SynthIDText for watermarking LLMs. The code shown in this repo also +demonstrates the training of the detector for detecting such watermarked text. This detector can be uploaded onto +a private HF hub repo (private for security reasons) and can be initialized again through pretrained model loading also shown in this script. + +See our blog post: https://huggingface.co/blog/synthid-text + + +## Python version + +You need Python 3.9 to run this example.
+ +## Installation and running + +Once you install transformers you would need to install requirements for this project through requirements.txt provided in this folder. + +``` +pip install -r requirements.txt +``` + +## To run the detector training + +``` +python detector_training.py --model_name=google/gemma-7b-it +``` + +Check the script for more parameters that are tunable and check out the paper at +https://www.nature.com/articles/s41586-024-08025-4 for more information on these parameters. + +## Caveat + +Make sure to run the training of the detector and the detection on the same hardware +CPU, GPU or TPU to get consistent results (we use deterministic randomness which is hardware dependent). diff --git a/examples/research_projects/synthid_text/detector_training.py b/examples/research_projects/synthid_text/detector_training.py new file mode 100644 index 00000000000..35d0ea22f42 --- /dev/null +++ b/examples/research_projects/synthid_text/detector_training.py @@ -0,0 +1,502 @@ +# coding=utf-8 +# Copyright 2024 Google DeepMind. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import argparse +import dataclasses +import enum +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch + +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BayesianDetectorConfig, + BayesianDetectorModel, + SynthIDTextWatermarkDetector, + SynthIDTextWatermarkingConfig, + SynthIDTextWatermarkLogitsProcessor, +) +from utils import ( + get_tokenized_uwm_outputs, + get_tokenized_wm_outputs, + process_raw_model_outputs, + update_fn_if_fpr_tpr, + upload_model_to_hf, +) + + +@enum.unique +class ValidationMetric(enum.Enum): + """Direction along the z-axis.""" + + TPR_AT_FPR = "tpr_at_fpr" + CROSS_ENTROPY = "cross_entropy" + + +@dataclasses.dataclass +class TrainingArguments: + """Training arguments pertaining to the training loop itself.""" + + eval_metric: Optional[str] = dataclasses.field( + default=ValidationMetric.TPR_AT_FPR, metadata={"help": "The evaluation metric used."} + ) + + +def train_detector( + detector: torch.nn.Module, + g_values: torch.Tensor, + mask: torch.Tensor, + watermarked: torch.Tensor, + epochs: int = 250, + learning_rate: float = 1e-3, + minibatch_size: int = 64, + seed: int = 0, + l2_weight: float = 0.0, + shuffle: bool = True, + g_values_val: Optional[torch.Tensor] = None, + mask_val: Optional[torch.Tensor] = None, + watermarked_val: Optional[torch.Tensor] = None, + verbose: bool = False, + validation_metric: ValidationMetric = ValidationMetric.TPR_AT_FPR, +) -> Tuple[Dict[str, Any], float]: + """Trains a Bayesian detector model. + + Args: + g_values: g-values of shape [num_train, seq_len, watermarking_depth]. + mask: A binary array shape [num_train, seq_len] indicating which g-values + should be used. g-values with mask value 0 are discarded. + watermarked: A binary array of shape [num_train] indicating whether the + example is watermarked (0: unwatermarked, 1: watermarked). + epochs: Number of epochs to train for. + learning_rate: Learning rate for optimizer. 
+ minibatch_size: Minibatch size for training. Note that a minibatch + requires ~ 32 * minibatch_size * seq_len * watermarked_depth * + watermarked_depth bits of memory. + seed: Seed for parameter initialization. + l2_weight: Weight to apply to L2 regularization for delta parameters. + shuffle: Whether to shuffle before training. + g_values_val: Validation g-values of shape [num_val, seq_len, + watermarking_depth]. + mask_val: Validation mask of shape [num_val, seq_len]. + watermarked_val: Validation watermark labels of shape [num_val]. + verbose: Boolean indicating verbosity of training. If true, the loss will + be printed. Defaulted to False. + use_tpr_fpr_for_val: Whether to use TPR@FPR=1% as metric for validation. + If false, use cross entropy loss. + + Returns: + Tuple of + training_history: Training history keyed by epoch number where the + values are + dictionaries containing the loss, validation loss, and model + parameters, + keyed by + 'loss', 'val_loss', and 'params', respectively. + min_val_loss: Minimum validation loss achieved during training. 
+ """ + + # Set the random seed for reproducibility + torch.manual_seed(seed) + + # Shuffle the data if required + if shuffle: + indices = torch.randperm(len(g_values)) + g_values = g_values[indices] + mask = mask[indices] + watermarked = watermarked[indices] + + # Initialize optimizer + optimizer = torch.optim.Adam(detector.parameters(), lr=learning_rate) + history = {} + min_val_loss = float("inf") + + for epoch in range(epochs): + losses = [] + detector.train() + num_batches = len(g_values) // minibatch_size + for i in range(0, len(g_values), minibatch_size): + end = i + minibatch_size + if end > len(g_values): + break + loss_batch_weight = l2_weight / num_batches + + optimizer.zero_grad() + loss = detector( + g_values=g_values[i:end], + mask=mask[i:end], + labels=watermarked[i:end], + loss_batch_weight=loss_batch_weight, + )[1] + loss.backward() + optimizer.step() + losses.append(loss.item()) + train_loss = sum(losses) / len(losses) + + val_losses = [] + if g_values_val is not None: + detector.eval() + if validation_metric == ValidationMetric.TPR_AT_FPR: + val_loss = update_fn_if_fpr_tpr( + detector, + g_values_val, + mask_val, + watermarked_val, + minibatch_size=minibatch_size, + ) + else: + for i in range(0, len(g_values_val), minibatch_size): + end = i + minibatch_size + if end > len(g_values_val): + break + with torch.no_grad(): + v_loss = detector( + g_values=g_values_val[i:end], + mask=mask_val[i:end], + labels=watermarked_val[i:end], + loss_batch_weight=0, + )[1] + val_losses.append(v_loss.item()) + val_loss = sum(val_losses) / len(val_losses) + + # Store training history + history[epoch + 1] = {"loss": train_loss, "val_loss": val_loss} + if verbose: + if val_loss is not None: + print(f"Epoch {epoch}: loss {loss} (train), {val_loss} (val)") + else: + print(f"Epoch {epoch}: loss {loss} (train)") + + if val_loss is not None and val_loss < min_val_loss: + min_val_loss = val_loss + best_val_epoch = epoch + + if verbose: + print(f"Best val Epoch: 
{best_val_epoch}, min_val_loss: {min_val_loss}") + + return history, min_val_loss + + +def train_best_detector( + tokenized_wm_outputs: Union[List[np.ndarray], np.ndarray], + tokenized_uwm_outputs: Union[List[np.ndarray], np.ndarray], + logits_processor: SynthIDTextWatermarkLogitsProcessor, + tokenizer: Any, + torch_device: torch.device, + test_size: float = 0.3, + pos_truncation_length: Optional[int] = 200, + neg_truncation_length: Optional[int] = 100, + max_padded_length: int = 2300, + n_epochs: int = 50, + learning_rate: float = 2.1e-2, + l2_weights: np.ndarray = np.logspace(-3, -2, num=4), + verbose: bool = False, + validation_metric: ValidationMetric = ValidationMetric.TPR_AT_FPR, +): + """Train and return the best detector given range of hyperparameters. + + In practice, we have found that tuning pos_truncation_length, + neg_truncation_length, n_epochs, learning_rate and l2_weights can help + improve the performance of the detector. We reccommend tuning these + parameters for your data. 
+ """ + l2_weights = list(l2_weights) + + ( + train_g_values, + train_masks, + train_labels, + cv_g_values, + cv_masks, + cv_labels, + ) = process_raw_model_outputs( + logits_processor, + tokenizer, + pos_truncation_length, + neg_truncation_length, + max_padded_length, + tokenized_wm_outputs, + test_size, + tokenized_uwm_outputs, + torch_device, + ) + + best_detector = None + lowest_loss = float("inf") + val_losses = [] + for l2_weight in l2_weights: + config = BayesianDetectorConfig(watermarking_depth=len(logits_processor.keys)) + detector = BayesianDetectorModel(config).to(torch_device) + _, min_val_loss = train_detector( + detector=detector, + g_values=train_g_values, + mask=train_masks, + watermarked=train_labels, + g_values_val=cv_g_values, + mask_val=cv_masks, + watermarked_val=cv_labels, + learning_rate=learning_rate, + l2_weight=l2_weight, + epochs=n_epochs, + verbose=verbose, + validation_metric=validation_metric, + ) + val_losses.append(min_val_loss) + if min_val_loss < lowest_loss: + lowest_loss = min_val_loss + best_detector = detector + return best_detector, lowest_loss + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", + type=str, + default="google/gemma-2b-it", + help=("LM model to train the detector for."), + ) + parser.add_argument( + "--temperature", + type=float, + default=1.0, + help=("Temperature to sample from the model."), + ) + parser.add_argument( + "--top_k", + type=int, + default=40, + help=("Top K for sampling."), + ) + parser.add_argument( + "--top_p", + type=float, + default=1.0, + help=("Top P for sampling."), + ) + parser.add_argument( + "--num_negatives", + type=int, + default=10000, + help=("Number of negatives for detector training."), + ) + parser.add_argument( + "--pos_batch_size", + type=int, + default=32, + help=("Batch size of watermarked positives while sampling."), + ) + parser.add_argument( + "--num_pos_batch", + type=int, + default=313, + help=("Number of positive 
batches for training."), + ) + parser.add_argument( + "--generation_length", + type=int, + default=512, + help=("Generation length for sampling."), + ) + parser.add_argument( + "--save_model_to_hf_hub", + action="store_true", + help=("Whether to save the trained model HF hub. By default it will be a private repo."), + ) + parser.add_argument( + "--load_from_hf_hub", + action="store_true", + help=( + "Whether to load trained detector model from HF Hub, make sure its the model trained on the same model " + "we are loading in the script." + ), + ) + parser.add_argument( + "--hf_hub_model_name", + type=str, + default=None, + help=("HF hub model name for loading of saving the model."), + ) + parser.add_argument( + "--eval_detector_on_prompts", + action="store_true", + help=("Evaluate detector on a prompt and print probability of watermark."), + ) + + args = parser.parse_args() + model_name = args.model_name + temperature = args.temperature + top_k = args.top_k + top_p = args.top_p + num_negatives = args.num_negatives + pos_batch_size = args.pos_batch_size + num_pos_batch = args.num_pos_batch + if num_pos_batch < 10: + raise ValueError("--num_pos_batch should be greater than 10.") + generation_length = args.generation_length + save_model_to_hf_hub = args.save_model_to_hf_hub + load_from_hf_hub = args.load_from_hf_hub + repo_name = args.hf_hub_model_name + eval_detector_on_prompts = args.eval_detector_on_prompts + + NEG_BATCH_SIZE = 32 + + # Truncate outputs to this length for training. + POS_TRUNCATION_LENGTH = 200 + NEG_TRUNCATION_LENGTH = 100 + # Pad trucated outputs to this length for equal shape across all batches. + MAX_PADDED_LENGTH = 1000 + + DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + if DEVICE.type not in ("cuda", "tpu"): + raise ValueError("We have found the training stable on GPU and TPU, we are working on" " a fix for CPUs") + + model = None + if not load_from_hf_hub: + # Change this to make your watermark unique. 
Check documentation in the paper to understand the + # impact of these parameters. + DEFAULT_WATERMARKING_CONFIG = { + "ngram_len": 5, # This corresponds to H=4 context window size in the paper. + "keys": [ + 654, + 400, + 836, + 123, + 340, + 443, + 597, + 160, + 57, + 29, + 590, + 639, + 13, + 715, + 468, + 990, + 966, + 226, + 324, + 585, + 118, + 504, + 421, + 521, + 129, + 669, + 732, + 225, + 90, + 960, + ], + "sampling_table_size": 2**16, + "sampling_table_seed": 0, + "context_history_size": 1024, + } + watermark_config = SynthIDTextWatermarkingConfig(**DEFAULT_WATERMARKING_CONFIG) + + model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE) + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token + + logits_processor = SynthIDTextWatermarkLogitsProcessor(**DEFAULT_WATERMARKING_CONFIG, device=DEVICE) + tokenized_wm_outputs = get_tokenized_wm_outputs( + model, + tokenizer, + watermark_config, + num_pos_batch, + pos_batch_size, + temperature, + generation_length, + top_k, + top_p, + DEVICE, + ) + tokenized_uwm_outputs = get_tokenized_uwm_outputs(num_negatives, NEG_BATCH_SIZE, tokenizer, DEVICE) + + best_detector, lowest_loss = train_best_detector( + tokenized_wm_outputs=tokenized_wm_outputs, + tokenized_uwm_outputs=tokenized_uwm_outputs, + logits_processor=logits_processor, + tokenizer=tokenizer, + torch_device=DEVICE, + test_size=0.3, + pos_truncation_length=POS_TRUNCATION_LENGTH, + neg_truncation_length=NEG_TRUNCATION_LENGTH, + max_padded_length=MAX_PADDED_LENGTH, + n_epochs=100, + learning_rate=3e-3, + l2_weights=[ + 0, + ], + verbose=True, + validation_metric=ValidationMetric.TPR_AT_FPR, + ) + else: + if repo_name is None: + raise ValueError("When loading from pretrained detector model name cannot be None.") + best_detector = BayesianDetectorModel.from_pretrained(repo_name).to(DEVICE) + + best_detector.config.set_detector_information( + model_name=model_name, 
watermarking_config=DEFAULT_WATERMARKING_CONFIG + ) + if save_model_to_hf_hub: + upload_model_to_hf(best_detector, repo_name) + + # Evaluate model response with the detector + if eval_detector_on_prompts: + model_name = best_detector.config.model_name + watermark_config_dict = best_detector.config.watermarking_config + logits_processor = SynthIDTextWatermarkLogitsProcessor(**watermark_config_dict, device=DEVICE) + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token + synthid_text_detector = SynthIDTextWatermarkDetector(best_detector, logits_processor, tokenizer) + + if model is None: + model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE) + watermarking_config = SynthIDTextWatermarkingConfig(**watermark_config_dict) + + prompts = ["Write a essay on cats."] + inputs = tokenizer( + prompts, + return_tensors="pt", + padding=True, + ).to(DEVICE) + + _, inputs_len = inputs["input_ids"].shape + + outputs = model.generate( + **inputs, + watermarking_config=watermarking_config, + do_sample=True, + max_length=inputs_len + generation_length, + temperature=temperature, + top_k=40, + top_p=1.0, + ) + outputs = outputs[:, inputs_len:] + result = synthid_text_detector(outputs) + + # You should set this based on expected fpr (false positive rate) and tpr (true positive rate). + # Check our demo at HF Spaces for more info. 
+ upper_threshold = 0.95 + lower_threshold = 0.12 + if result[0][0] > upper_threshold: + print("The text is watermarked.") + elif lower_threshold < result[0][0] < upper_threshold: + print("It is hard to determine if the text is watermarked or not.") + else: + print("The text is not watermarked.") diff --git a/examples/research_projects/synthid_text/requirements.txt b/examples/research_projects/synthid_text/requirements.txt new file mode 100644 index 00000000000..9e40a93ee08 --- /dev/null +++ b/examples/research_projects/synthid_text/requirements.txt @@ -0,0 +1,5 @@ +tensorflow-datasets>=4.9.3 +torch >= 1.3 +datasets +scikit-learn +tensorflow diff --git a/examples/research_projects/synthid_text/utils.py b/examples/research_projects/synthid_text/utils.py new file mode 100644 index 00000000000..abcb6ca2f28 --- /dev/null +++ b/examples/research_projects/synthid_text/utils.py @@ -0,0 +1,408 @@ +# coding=utf-8 +# Copyright 2024 Google DeepMind. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +from typing import Any, List, Optional, Tuple + +import datasets +import numpy as np +import tensorflow as tf +import tensorflow_datasets as tfds +import torch +import tqdm +from huggingface_hub import HfApi, create_repo +from huggingface_hub.utils import RepositoryNotFoundError +from sklearn import model_selection + +import transformers + + +def pad_to_len( + arr: torch.Tensor, + target_len: int, + left_pad: bool, + eos_token: int, + device: torch.device, +) -> torch.Tensor: + """Pad or truncate array to given length.""" + if arr.shape[1] < target_len: + shape_for_ones = list(arr.shape) + shape_for_ones[1] = target_len - shape_for_ones[1] + padded = ( + torch.ones( + shape_for_ones, + device=device, + dtype=torch.long, + ) + * eos_token + ) + if not left_pad: + arr = torch.concatenate((arr, padded), dim=1) + else: + arr = torch.concatenate((padded, arr), dim=1) + else: + arr = arr[:, :target_len] + return arr + + +def filter_and_truncate( + outputs: torch.Tensor, + truncation_length: Optional[int], + eos_token_mask: torch.Tensor, +) -> torch.Tensor: + """Filter and truncate outputs to given length. + + Args: + outputs: output tensor of shape [batch_size, output_len] + truncation_length: Length to truncate the final output. + eos_token_mask: EOS token mask of shape [batch_size, output_len] + + Returns: + output tensor of shape [batch_size, truncation_length]. 
+ """ + if truncation_length: + outputs = outputs[:, :truncation_length] + truncation_mask = torch.sum(eos_token_mask, dim=1) >= truncation_length + return outputs[truncation_mask, :] + return outputs + + +def process_outputs_for_training( + all_outputs: List[torch.Tensor], + logits_processor: transformers.generation.SynthIDTextWatermarkLogitsProcessor, + tokenizer: Any, + pos_truncation_length: Optional[int], + neg_truncation_length: Optional[int], + max_length: int, + is_cv: bool, + is_pos: bool, + torch_device: torch.device, +) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + """Process raw model outputs into format understandable by the detector. + + Args: + all_outputs: sequence of outputs of shape [batch_size, output_len]. + logits_processor: logits processor used for watermarking. + tokenizer: tokenizer used for the model. + pos_truncation_length: Length to truncate wm outputs. + neg_truncation_length: Length to truncate uwm outputs. + max_length: Length to pad truncated outputs so that all processed entries. + have same shape. + is_cv: Process given outputs for cross validation. + is_pos: Process given outputs for positives. + torch_device: torch device to use. + + Returns: + Tuple of + all_masks: list of masks of shape [batch_size, max_length]. + all_g_values: list of g_values of shape [batch_size, max_length, depth]. + """ + all_masks = [] + all_g_values = [] + for outputs in tqdm.tqdm(all_outputs): + # outputs is of shape [batch_size, output_len]. + # output_len can differ from batch to batch. + eos_token_mask = logits_processor.compute_eos_token_mask( + input_ids=outputs, + eos_token_id=tokenizer.eos_token_id, + ) + if is_pos or is_cv: + # filter with length for positives for both train and CV. + # We also filter for length when CV negatives are processed. 
+ outputs = filter_and_truncate(outputs, pos_truncation_length, eos_token_mask) + elif not is_pos and not is_cv: + outputs = filter_and_truncate(outputs, neg_truncation_length, eos_token_mask) + + # If no filtered outputs skip this batch. + if outputs.shape[0] == 0: + continue + + # All outputs are padded to max-length with eos-tokens. + outputs = pad_to_len(outputs, max_length, False, tokenizer.eos_token_id, torch_device) + # outputs shape [num_filtered_entries, max_length] + + eos_token_mask = logits_processor.compute_eos_token_mask( + input_ids=outputs, + eos_token_id=tokenizer.eos_token_id, + ) + + context_repetition_mask = logits_processor.compute_context_repetition_mask( + input_ids=outputs, + ) + + # context_repetition_mask of shape [num_filtered_entries, max_length - + # (ngram_len - 1)]. + context_repetition_mask = pad_to_len(context_repetition_mask, max_length, True, 0, torch_device) + # We pad on left to get same max_length shape. + # context_repetition_mask of shape [num_filtered_entries, max_length]. + combined_mask = context_repetition_mask * eos_token_mask + + g_values = logits_processor.compute_g_values( + input_ids=outputs, + ) + + # g_values of shape [num_filtered_entries, max_length - (ngram_len - 1), + # depth]. + g_values = pad_to_len(g_values, max_length, True, 0, torch_device) + + # We pad on left to get same max_length shape. + # g_values of shape [num_filtered_entries, max_length, depth]. 
+ all_masks.append(combined_mask) + all_g_values.append(g_values) + return all_masks, all_g_values + + +def tpr_at_fpr(detector, detector_inputs, w_true, minibatch_size, target_fpr=0.01) -> torch.Tensor: + """Calculates true positive rate (TPR) at false positive rate (FPR)=target_fpr.""" + positive_idxs = w_true == 1 + negative_idxs = w_true == 0 + num_samples = detector_inputs[0].size(0) + + w_preds = [] + for start in range(0, num_samples, minibatch_size): + end = start + minibatch_size + detector_inputs_ = ( + detector_inputs[0][start:end], + detector_inputs[1][start:end], + ) + with torch.no_grad(): + w_pred = detector(*detector_inputs_)[0] + w_preds.append(w_pred) + + w_pred = torch.cat(w_preds, dim=0) # Concatenate predictions + positive_scores = w_pred[positive_idxs] + negative_scores = w_pred[negative_idxs] + + # Calculate the FPR threshold + # Note: percentile -> quantile + fpr_threshold = torch.quantile(negative_scores, 1 - target_fpr) + # Note: need to switch to FP32 since torch.mean doesn't work with torch.bool + return torch.mean((positive_scores >= fpr_threshold).to(dtype=torch.float32)).item() # TPR + + +def update_fn_if_fpr_tpr(detector, g_values_val, mask_val, watermarked_val, minibatch_size): + """Loss function for negative TPR@FPR=1% as the validation loss.""" + tpr_ = tpr_at_fpr( + detector=detector, + detector_inputs=(g_values_val, mask_val), + w_true=watermarked_val, + minibatch_size=minibatch_size, + ) + return -tpr_ + + +def process_raw_model_outputs( + logits_processor, + tokenizer, + pos_truncation_length, + neg_truncation_length, + max_padded_length, + tokenized_wm_outputs, + test_size, + tokenized_uwm_outputs, + torch_device, +): + # Split data into train and CV + train_wm_outputs, cv_wm_outputs = model_selection.train_test_split(tokenized_wm_outputs, test_size=test_size) + + train_uwm_outputs, cv_uwm_outputs = model_selection.train_test_split(tokenized_uwm_outputs, test_size=test_size) + + process_kwargs = { + "logits_processor": 
logits_processor, + "tokenizer": tokenizer, + "pos_truncation_length": pos_truncation_length, + "neg_truncation_length": neg_truncation_length, + "max_length": max_padded_length, + "torch_device": torch_device, + } + + # Process both train and CV data for training + wm_masks_train, wm_g_values_train = process_outputs_for_training( + [torch.tensor(outputs, device=torch_device, dtype=torch.long) for outputs in train_wm_outputs], + is_pos=True, + is_cv=False, + **process_kwargs, + ) + wm_masks_cv, wm_g_values_cv = process_outputs_for_training( + [torch.tensor(outputs, device=torch_device, dtype=torch.long) for outputs in cv_wm_outputs], + is_pos=True, + is_cv=True, + **process_kwargs, + ) + uwm_masks_train, uwm_g_values_train = process_outputs_for_training( + [torch.tensor(outputs, device=torch_device, dtype=torch.long) for outputs in train_uwm_outputs], + is_pos=False, + is_cv=False, + **process_kwargs, + ) + uwm_masks_cv, uwm_g_values_cv = process_outputs_for_training( + [torch.tensor(outputs, device=torch_device, dtype=torch.long) for outputs in cv_uwm_outputs], + is_pos=False, + is_cv=True, + **process_kwargs, + ) + + # We get list of data; here we concat all together to be passed to the detector. + def pack(mask, g_values): + mask = torch.cat(mask, dim=0) + g = torch.cat(g_values, dim=0) + return mask, g + + wm_masks_train, wm_g_values_train = pack(wm_masks_train, wm_g_values_train) + # Note: Use float instead of bool. 
Otherwise, the entropy calculation doesn't work + wm_labels_train = torch.ones((wm_masks_train.shape[0],), dtype=torch.float, device=torch_device) + + wm_masks_cv, wm_g_values_cv = pack(wm_masks_cv, wm_g_values_cv) + wm_labels_cv = torch.ones((wm_masks_cv.shape[0],), dtype=torch.float, device=torch_device) + + uwm_masks_train, uwm_g_values_train = pack(uwm_masks_train, uwm_g_values_train) + uwm_labels_train = torch.zeros((uwm_masks_train.shape[0],), dtype=torch.float, device=torch_device) + + uwm_masks_cv, uwm_g_values_cv = pack(uwm_masks_cv, uwm_g_values_cv) + uwm_labels_cv = torch.zeros((uwm_masks_cv.shape[0],), dtype=torch.float, device=torch_device) + + # Concat pos and negatives data together. + train_g_values = torch.cat((wm_g_values_train, uwm_g_values_train), dim=0).squeeze() + train_labels = torch.cat((wm_labels_train, uwm_labels_train), axis=0).squeeze() + train_masks = torch.cat((wm_masks_train, uwm_masks_train), axis=0).squeeze() + + cv_g_values = torch.cat((wm_g_values_cv, uwm_g_values_cv), axis=0).squeeze() + cv_labels = torch.cat((wm_labels_cv, uwm_labels_cv), axis=0).squeeze() + cv_masks = torch.cat((wm_masks_cv, uwm_masks_cv), axis=0).squeeze() + + # Shuffle data. + shuffled_idx = torch.randperm(train_g_values.shape[0]) # Use torch for GPU compatibility + + train_g_values = train_g_values[shuffled_idx] + train_labels = train_labels[shuffled_idx] + train_masks = train_masks[shuffled_idx] + + # Shuffle the cross-validation data + shuffled_idx_cv = torch.randperm(cv_g_values.shape[0]) # Use torch for GPU compatibility + cv_g_values = cv_g_values[shuffled_idx_cv] + cv_labels = cv_labels[shuffled_idx_cv] + cv_masks = cv_masks[shuffled_idx_cv] + + # Del some variables so we free up GPU memory. 
+ del ( + wm_g_values_train, + wm_labels_train, + wm_masks_train, + wm_g_values_cv, + wm_labels_cv, + wm_masks_cv, + ) + gc.collect() + torch.cuda.empty_cache() + + return train_g_values, train_masks, train_labels, cv_g_values, cv_masks, cv_labels + + +def get_tokenized_uwm_outputs(num_negatives, neg_batch_size, tokenizer, device): + dataset, info = tfds.load("wikipedia/20230601.en", split="train", with_info=True) + dataset = dataset.take(num_negatives) + + # Convert the dataset to a DataFrame + df = tfds.as_dataframe(dataset, info) + ds = tf.data.Dataset.from_tensor_slices(dict(df)) + tf.random.set_seed(0) + ds = ds.shuffle(buffer_size=10_000) + ds = ds.batch(batch_size=neg_batch_size) + + tokenized_uwm_outputs = [] + # Pad to this length (on the right) for batching. + padded_length = 1000 + for i, batch in tqdm.tqdm(enumerate(ds)): + responses = [val.decode() for val in batch["text"].numpy()] + inputs = tokenizer( + responses, + return_tensors="pt", + padding=True, + ).to(device) + inputs = inputs["input_ids"].cpu().numpy() + if inputs.shape[1] >= padded_length: + inputs = inputs[:, :padded_length] + else: + inputs = np.concatenate( + [inputs, np.ones((neg_batch_size, padded_length - inputs.shape[1])) * tokenizer.eos_token_id], axis=1 + ) + tokenized_uwm_outputs.append(inputs) + if len(tokenized_uwm_outputs) * neg_batch_size > num_negatives: + break + return tokenized_uwm_outputs + + +def get_tokenized_wm_outputs( + model, + tokenizer, + watermark_config, + num_pos_batches, + pos_batch_size, + temperature, + max_output_len, + top_k, + top_p, + device, +): + eli5_prompts = datasets.load_dataset("Pavithree/eli5") + + wm_outputs = [] + + for batch_id in tqdm.tqdm(range(num_pos_batches)): + prompts = eli5_prompts["train"]["title"][batch_id * pos_batch_size : (batch_id + 1) * pos_batch_size] + prompts = [prompt.strip('"') for prompt in prompts] + inputs = tokenizer( + prompts, + return_tensors="pt", + padding=True, + ).to(device) + _, inputs_len = 
inputs["input_ids"].shape + + outputs = model.generate( + **inputs, + watermarking_config=watermark_config, + do_sample=True, + max_length=inputs_len + max_output_len, + temperature=temperature, + top_k=top_k, + top_p=top_p, + ) + + wm_outputs.append(outputs[:, inputs_len:].cpu().detach()) + + del outputs, inputs, prompts + gc.collect() + + gc.collect() + torch.cuda.empty_cache() + return wm_outputs + + +def upload_model_to_hf(model, hf_repo_name: str, private: bool = True): + api = HfApi() + + # Check if the repository exists + try: + api.repo_info(repo_id=hf_repo_name, use_auth_token=True) + print(f"Repository '{hf_repo_name}' already exists.") + except RepositoryNotFoundError: + # If the repository does not exist, create it + print(f"Repository '{hf_repo_name}' not found. Creating it...") + create_repo(repo_id=hf_repo_name, private=private, use_auth_token=True) + print(f"Repository '{hf_repo_name}' created successfully.") + + # Push the model to the Hugging Face Hub + print(f"Uploading model to Hugging Face repo '{hf_repo_name}'...") + model.push_to_hub(repo_id=hf_repo_name, use_auth_token=True) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7f408859c53..771e3e8f0ae 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1301,6 +1301,8 @@ _import_structure["generation"].extend( [ "AlternatingCodebooksLogitsProcessor", + "BayesianDetectorConfig", + "BayesianDetectorModel", "BeamScorer", "BeamSearchScorer", "ClassifierFreeGuidanceLogitsProcessor", @@ -1339,6 +1341,9 @@ "StopStringCriteria", "SuppressTokensAtBeginLogitsProcessor", "SuppressTokensLogitsProcessor", + "SynthIDTextWatermarkDetector", + "SynthIDTextWatermarkingConfig", + "SynthIDTextWatermarkLogitsProcessor", "TemperatureLogitsWarper", "TopKLogitsWarper", "TopPLogitsWarper", @@ -6213,6 +6218,8 @@ ) from .generation import ( AlternatingCodebooksLogitsProcessor, + BayesianDetectorConfig, + BayesianDetectorModel, BeamScorer, BeamSearchScorer, 
ClassifierFreeGuidanceLogitsProcessor, @@ -6251,6 +6258,9 @@ StopStringCriteria, SuppressTokensAtBeginLogitsProcessor, SuppressTokensLogitsProcessor, + SynthIDTextWatermarkDetector, + SynthIDTextWatermarkingConfig, + SynthIDTextWatermarkLogitsProcessor, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper, diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py index 2bea0026195..b487fa3c7fe 100644 --- a/src/transformers/generation/__init__.py +++ b/src/transformers/generation/__init__.py @@ -18,7 +18,13 @@ _import_structure = { - "configuration_utils": ["GenerationConfig", "GenerationMode", "WatermarkingConfig"], + "configuration_utils": [ + "BaseWatermarkingConfig", + "GenerationConfig", + "GenerationMode", + "SynthIDTextWatermarkingConfig", + "WatermarkingConfig", + ], "streamers": ["TextIteratorStreamer", "TextStreamer"], } @@ -71,6 +77,7 @@ "SequenceBiasLogitsProcessor", "SuppressTokensLogitsProcessor", "SuppressTokensAtBeginLogitsProcessor", + "SynthIDTextWatermarkLogitsProcessor", "TemperatureLogitsWarper", "TopKLogitsWarper", "TopPLogitsWarper", @@ -110,6 +117,9 @@ _import_structure["watermarking"] = [ "WatermarkDetector", "WatermarkDetectorOutput", + "BayesianDetectorModel", + "BayesianDetectorConfig", + "SynthIDTextWatermarkDetector", ] try: @@ -179,7 +189,13 @@ ] if TYPE_CHECKING: - from .configuration_utils import GenerationConfig, GenerationMode, WatermarkingConfig + from .configuration_utils import ( + BaseWatermarkingConfig, + GenerationConfig, + GenerationMode, + SynthIDTextWatermarkingConfig, + WatermarkingConfig, + ) from .streamers import TextIteratorStreamer, TextStreamer try: @@ -217,6 +233,7 @@ SequenceBiasLogitsProcessor, SuppressTokensAtBeginLogitsProcessor, SuppressTokensLogitsProcessor, + SynthIDTextWatermarkLogitsProcessor, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper, @@ -254,6 +271,9 @@ SampleEncoderDecoderOutput, ) from .watermarking import ( + BayesianDetectorConfig, + 
BayesianDetectorModel, + SynthIDTextWatermarkDetector, WatermarkDetector, WatermarkDetectorOutput, ) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 37d57248c46..c460a19885a 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -18,8 +18,9 @@ import json import os import warnings +from abc import ABC, abstractmethod from dataclasses import dataclass, is_dataclass -from typing import TYPE_CHECKING, Any, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from .. import __version__ from ..configuration_utils import PretrainedConfig @@ -59,6 +60,7 @@ StaticCache, StaticCacheConfig, ) + from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor NEEDS_CACHE_CONFIG["quantized"] = QuantizedCacheConfig NEEDS_CACHE_CONFIG["static"] = StaticCacheConfig @@ -280,23 +282,10 @@ class GenerationConfig(PushToHubMixin): low_memory (`bool`, *optional*): Switch to sequential beam search and sequential topk for contrastive search to reduce peak memory. Used with beam search and contrastive search. - watermarking_config (`WatermarkingConfig` or `dict`, *optional*): - Arguments used to watermark the model outputs by adding a small bias to randomly selected set of "green" tokens. - If passed as `Dict`, it will be converted to a `WatermarkingConfig` internally. - See [this paper](https://arxiv.org/abs/2306.04634) for more details. Accepts the following keys: - - greenlist_ratio (`float`): - Used for watermarking. The ratio of "green" tokens used to the vocabulary size. Defaults to 0.25. - - bias (`float`): - Used with watermarking. The bias added to the selected "green" tokens' logits. Defaults to 2.0. - - hashing_key (`int`): - Hahsing key used for watermarking. Defaults to 15485863 (the millionth prime). - - seeding_scheme (`str`): - Algorithm to use for watermarking. 
Accepts values: - - "lefthash" (default): "green" tokens selection depend on the last token (Algorithm 2 from the paper) - - "selfhash": "green" tokens selection depends on the current token itself (Algorithm 3 from the paper) - The downside of this scheme is that it considers all possible next tokens and can be slower than "lefthash". - - context_width (`int`): - The context length of previous tokens to use in seeding. Higher context length makes watermarking more robust. + watermarking_config (`BaseWatermarkingConfig` or `dict`, *optional*): + Arguments used to watermark the model outputs by adding a small bias to randomly selected set of "green" + tokens. See the docs of [`SynthIDTextWatermarkingConfig`] and [`WatermarkingConfig`] for more + details. If passed as `Dict`, it will be converted to a `WatermarkingConfig` internally. > Parameters that define the output variables of generate @@ -430,7 +419,7 @@ def __init__(self, **kwargs): watermarking_config = kwargs.pop("watermarking_config", None) if watermarking_config is None: self.watermarking_config = None - elif isinstance(watermarking_config, WatermarkingConfig): + elif isinstance(watermarking_config, BaseWatermarkingConfig): self.watermarking_config = watermarking_config else: self.watermarking_config = WatermarkingConfig.from_dict(watermarking_config) @@ -766,7 +755,15 @@ def validate(self, is_init=False): # 6. check watermarking arguments if self.watermarking_config is not None: - if not isinstance(self.watermarking_config, WatermarkingConfig): + if not ( + isinstance(self.watermarking_config, WatermarkingConfig) + or isinstance(self.watermarking_config, SynthIDTextWatermarkingConfig) + ): + warnings.warn( + "`watermarking_config` as a dict is deprecated. 
Please construct `watermarking_config` object with " + "`WatermarkingConfig` or `SynthIDTextWatermarkingConfig` class.", + FutureWarning, + ) self.watermarking_config = WatermarkingConfig.from_dict(self.watermarking_config) self.watermarking_config.validate() @@ -1287,52 +1284,20 @@ def update(self, **kwargs): @dataclass -class WatermarkingConfig: - """ - Class that holds arguments for watermark generation and should be passed into `GenerationConfig` during `generate`. - See [this paper](https://arxiv.org/abs/2306.04634) for more details on the arguments. - - Accepts the following keys: - - greenlist_ratio (`float`): - Used for watermarking. The ratio of "green" tokens used to the vocabulary size. Defaults to 0.25. - - bias (`float`): - Used with watermarking. The bias added to the selected "green" tokens' logits. Defaults to 2.0. - - hashing_key (`int`): - Hashing key used for watermarking. Defaults to 15485863 (the millionth prime). - - seeding_scheme (`str`): - Algorithm to use for watermarking. Accepts values: - - "lefthash" (default): "green" tokens selection depend on the last token (Algorithm 2 from the paper) - - "selfhash": "green" tokens selection depends on the current token itself (Algorithm 3 from the paper) - The downside of this scheme is that it considers all possible next tokens and can be slower than "lefthash". - - context_width(`int`): - The context length of previous tokens to use in seeding. Higher context length makes watermarking more robust. 
- """ - - def __init__( - self, - greenlist_ratio: Optional[float] = 0.25, - bias: Optional[float] = 2.0, - hashing_key: Optional[int] = 15485863, - seeding_scheme: Optional[str] = "lefthash", - context_width: Optional[int] = 1, - ): - self.greenlist_ratio = greenlist_ratio - self.bias = bias - self.hashing_key = hashing_key - self.seeding_scheme = seeding_scheme - self.context_width = context_width +class BaseWatermarkingConfig(ABC): + """Generic watermarking config""" @classmethod def from_dict(cls, config_dict, **kwargs): """ - Constructs a WatermarkingConfig instance from a dictionary of parameters. + Constructs a BaseWatermarkingConfig instance from a dictionary of parameters. Args: config_dict (Dict[str, Any]): Dictionary containing configuration parameters. **kwargs: Additional keyword arguments to override dictionary values. Returns: - WatermarkingConfig: Instance of WatermarkingConfig constructed from the dictionary. + BaseWatermarkingConfig: Instance of BaseWatermarkingConfig constructed from the dictionary. """ config = cls(**config_dict) to_remove = [] @@ -1394,6 +1359,49 @@ def update(self, **kwargs): if hasattr(self, key): setattr(self, key, value) + @abstractmethod + def validate(self): ... + + @abstractmethod + def construct_processor(self, vocab_size): ... + + +@dataclass +class WatermarkingConfig(BaseWatermarkingConfig): + """ + Class that holds arguments for watermark generation and should be passed into `GenerationConfig` during `generate`. + See [this paper](https://arxiv.org/abs/2306.04634) for more details on the arguments. + + Accepts the following keys: + - greenlist_ratio (`float`): + Used for watermarking. The ratio of "green" tokens used to the vocabulary size. Defaults to 0.25. + - bias (`float`): + Used with watermarking. The bias added to the selected "green" tokens' logits. Defaults to 2.0. + - hashing_key (`int`): + Hashing key used for watermarking. Defaults to 15485863 (the millionth prime). 
+ - seeding_scheme (`str`): + Algorithm to use for watermarking. Accepts values: + - "lefthash" (default): "green" tokens selection depend on the last token (Algorithm 2 from the paper) + - "selfhash": "green" tokens selection depends on the current token itself (Algorithm 3 from the paper) + The downside of this scheme is that it considers all possible next tokens and can be slower than "lefthash". + - context_width(`int`): + The context length of previous tokens to use in seeding. Higher context length makes watermarking more robust. + """ + + def __init__( + self, + greenlist_ratio: Optional[float] = 0.25, + bias: Optional[float] = 2.0, + hashing_key: Optional[int] = 15485863, + seeding_scheme: Optional[str] = "lefthash", + context_width: Optional[int] = 1, + ): + self.greenlist_ratio = greenlist_ratio + self.bias = bias + self.hashing_key = hashing_key + self.seeding_scheme = seeding_scheme + self.context_width = context_width + def validate(self): watermark_missing_arg_msg = ( "Some of the keys in `watermarking_config` are defined incorrectly. `{key}` should be {correct_value}` " @@ -1423,3 +1431,104 @@ def validate(self): found_value=self.context_width, ), ) + + def construct_processor(self, vocab_size: int, device) -> "WatermarkLogitsProcessor": + return WatermarkLogitsProcessor( + vocab_size=vocab_size, + device=device, + greenlist_ratio=self.greenlist_ratio, + bias=self.bias, + hashing_key=self.hashing_key, + seeding_scheme=self.seeding_scheme, + context_width=self.context_width, + ) + + +@dataclass +class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig): + """ + Class that holds arguments for watermark generation and should be passed into `GenerationConfig` during `generate`. + See [this paper](https://www.nature.com/articles/s41586-024-08025-4) for more details on the arguments. + + Args: + ngram_len (`int`): + Ngram length. + keys (`List[int]`): + A sequence of watermarking keys, one for each depth. 
+ context_history_size (`int`, *optional*, defaults to 1024): + Size of the tensor to keep track of seen contexts. + sampling_table_seed (`int`, *optional*, defaults to 0): + Random seed to generate the sampling table. + sampling_table_size (`int`, *optional*, defaults to 65536): + Size of the sampling table. + skip_first_ngram_calls (`bool`, *optional*, defaults to `False`): + Whether to skip first ngram calls. + debug_mode (`bool`, *optional*, defaults to `False`): + Logits are replaced with uniform ones before the watermarking modification is applied. This is to test the + implementation. + + Examples: + ```python + >>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig + + >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it') + >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it') + + >>> # SynthID Text configuration + >>> watermarking_config = SynthIDTextWatermarkingConfig( + ... keys=[654, 400, 836, 123, 340, 443, 597, 160, 57], + ... ngram_len=5, + ... ) + + >>> # Generation with watermarking + >>> tokenized_prompts = tokenizer(["your prompts here"]) + >>> output_sequences = model.generate( + ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, + ...
) + >>> watermarked_text = tokenizer.batch_decode(output_sequences) + ``` + """ + + def __init__( + self, + ngram_len: int, + keys: List[int], + context_history_size: int = 1024, + sampling_table_seed: int = 0, + sampling_table_size: int = 2**16, + skip_first_ngram_calls: bool = False, + debug_mode: bool = False, + ): + self.ngram_len = ngram_len + self.keys = keys + self.sampling_table_size = sampling_table_size + self.sampling_table_seed = sampling_table_seed + self.context_history_size = context_history_size + self.skip_first_ngram_calls = skip_first_ngram_calls + self.debug_mode = debug_mode + + def validate(self): + watermark_missing_arg_msg = ( + "Some of the keys in `watermarking_config` are defined incorrectly. `{key}` should be {correct_value}` " + "but found {found_value}" + ) + if self.sampling_table_size > 2**24: + raise ValueError( + watermark_missing_arg_msg.format( + key="sampling_table_size", + correct_value="< 2**24", + found_value=self.sampling_table_size, + ), + ) + + def construct_processor(self, vocab_size: int, device) -> "WatermarkLogitsProcessor": + return SynthIDTextWatermarkLogitsProcessor( + ngram_len=self.ngram_len, + keys=self.keys, + sampling_table_size=self.sampling_table_size, + sampling_table_seed=self.sampling_table_seed, + context_history_size=self.context_history_size, + device=device, + skip_first_ngram_calls=self.skip_first_ngram_calls, + debug_mode=self.debug_mode, + ) diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index d88c7a17d89..fde95c7a856 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team +# Copyright 2024 The HuggingFace Inc. team and Google DeepMind. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -2460,6 +2460,7 @@ def _score_rejection_sampling(self, input_seq: torch.LongTensor, scores: torch.F final_greenlist.append(greedy_predictions[i]) return torch.tensor(final_greenlist, device=input_seq.device) + @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: if input_ids.shape[-1] < self.context_width: logger.warning( @@ -2477,3 +2478,478 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to scores_processed[b_idx, greenlist_ids] = scores_processed[b_idx, greenlist_ids] + self.bias return scores_processed + + +class SynthIDTextWatermarkState: + """SynthID watermarking state.""" + + def __init__( + self, + batch_size: int, + ngram_len: int, + context_history_size: int, + device: torch.device, + ): + """Initializes the state. + + Args: + batch_size (`int`): Batch size. + ngram_len (`int`): Ngram length. + context_history_size (`int`): Size of the tensor to keep track of seen contexts. + device (`int`): Device to use. + """ + self.context = torch.zeros( + (batch_size, ngram_len - 1), + dtype=torch.int64, + device=device, + ) + self.context_history = torch.zeros( + (batch_size, context_history_size), + dtype=torch.int64, + device=device, + ) + self.num_calls = 0 + + +class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor): + r""" + Logits processor that implements watermarking techniques for text generation models. + This class facilitates the application of SynthID text watermarking, a method for embedding imperceptible signals + into generated text to aid in detecting synthetic content. It operates by subtly manipulating the probabilities of + token selection during text generation in a manner that can be reliably recovered later for verification. + + Key Features: + * **State Management:** Maintains internal state to track token sequences and generate watermarking keys + dynamically. 
+ + * **Key Generation:** Computes hashes based on token sequences and watermarking parameters to create unique keys + for each position. + + * **G-Value Sampling:** Employs a pre-computed sampling table to sample watermarking values (g-values) based on + the generated keys. + + * **Score Adjustment:** Applies calculated g-values to modify token probabilities during generation, embedding the + watermark. + + * **Context Repetition Handling:** Incorporates logic to avoid watermarking tokens in repeated contexts, + preserving naturalness. + + * **EOS Token Masking:** Supports masking end-of-sequence tokens to prevent their inclusion in watermarking + calculations. + + * **Utility Functions:** Provides functions to compute g-values directly, check for context repetition, create + EOS token masks, and estimate expected mean g-values. + + Refer to the paper at https://www.nature.com/articles/s41586-024-08025-4 for more details. + + Args: + ngram_len (`int`): + Ngram length. + keys (`List[int]`): + A sequence of watermarking keys, one for each depth. + sampling_table_size (`int`): + Size of the sampling table. + sampling_table_seed (`int`): + Random seed to generate the sampling table. + context_history_size (`int`): + Size of the tensor to keep track of seen contexts. + device (`torch.device`): + Device to use. + skip_first_ngram_calls (`bool`, *optional*, defaults to `False`): + Whether to skip first ngram calls. + debug_mode (`bool`, *optional*, defaults to `False`): + Logits are replaced with uniform ones before the watermarking modification is applied. This is to test the + implementation.
+ + Examples: + ```python + >>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig + + >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it') + >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it') + + >>> # SynthID Text configuration + >>> watermarking_config = SynthIDTextWatermarkingConfig( + ... keys=[654, 400, 836, 123, 340, 443, 597, 160, 57], + ... ngram_len=5, + ... ) + + >>> # Generation with watermarking + >>> tokenized_prompts = tokenizer(["your prompts here"]) + >>> output_sequences = model.generate( + ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, + ... ) + >>> watermarked_text = tokenizer.batch_decode(output_sequences) + ``` + """ + + def __init__( + self, + ngram_len: int, + keys: List[int], + sampling_table_size: int, + sampling_table_seed: int, + context_history_size: int, + device: torch.device, + skip_first_ngram_calls: bool = False, + debug_mode: bool = False, + ): + self.ngram_len = ngram_len + self.keys = torch.tensor(keys, device=device) + + generator = torch.Generator(device=device).manual_seed(sampling_table_seed) + # A random sampling table is pre-computed and modulo table size is applied to map from a hash of ngram keys to + # g values, this is similar to the hashtable implementation used in + # https://github.com/facebookresearch/three_bricks. We note that the hashing employed in this repository is + # different from that used to watermark the Gemini App, and hence the detectors trained based on the + # hashing in this repository will not transfer to text generated by the Gemini App. 
+ self.sampling_table = torch.randint( + low=0, + high=2, + size=(sampling_table_size,), + generator=generator, + device=device, + ) + self.context_history_size = context_history_size + self.device = device + self.state = None + self.skip_first_ngram_calls = skip_first_ngram_calls + self.debug_mode = debug_mode + + def _init_state(self, batch_size: int): + """Initializes the state.""" + self.state = SynthIDTextWatermarkState( + batch_size=batch_size, + ngram_len=self.ngram_len, + context_history_size=self.context_history_size, + device=self.device, + ) + + def update_scores(self, scores: torch.FloatTensor, g_values: torch.FloatTensor) -> torch.FloatTensor: + """Updates scores using the g values. + + We assume that the scores are in the log space. + Args: + scores (`torch.FloatTensor`): Scores (batch_size, vocab_size). + g_values (`torch.FloatTensor`): G values (batch_size, vocab_size, depth). + + Returns: + Updated scores (batch_size, vocab_size). + """ + _, _, depth = g_values.shape + + probs = torch.softmax(scores, dim=1) + + for i in range(depth): + g_values_at_depth = g_values[:, :, i] + g_mass_at_depth = (g_values_at_depth * probs).sum(axis=1, keepdims=True) + probs = probs * (1 + g_values_at_depth - g_mass_at_depth) + + log_probs = torch.log(probs) + log_probs = torch.where(torch.isfinite(log_probs), log_probs, torch.finfo(log_probs.dtype).min) + return log_probs + + @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + self._check_input_ids_shape(input_ids) + batch_size, vocab_size = scores.shape + + if self.debug_mode: + scores = torch.ones_like(scores) + + # Currently indices is just an arange to compute watermarking on the dense logits. + all_indices = torch.stack([torch.arange(vocab_size, device=self.device) for _ in range(batch_size)]) + + if self.state is None: + # Initialize watermarking state if it does not exist.
+ self._init_state(batch_size) + else: + # Append last input id (which is the input id added in last call) to the + # previous context so we have the context to be used for current + # watermarking. + self.state.context = torch.concat( + (self.state.context, input_ids[:, -1:]), + dim=1, + ) + self.state.context = self.state.context[:, 1:] + + if self.state is None: + raise ValueError("self.state can't be None! Call `self._init_state` to initialize the state.") + + self.state.num_calls += 1 + + # Don't watermark the first ngram_len - 1 tokens if set. + if self.skip_first_ngram_calls and self.state.num_calls < self.ngram_len: + return scores + + # 2. Generate random keys for each ngram key combination. + ngram_keys, hash_result_with_just_context = self._compute_keys(self.state.context, all_indices) + # ngram_keys shape [batch_size, top_k, depth] + + # 3. Sample g values. + g_values = self.sample_g_values(ngram_keys) + # g_values shape [batch_size, top_k, depth] + + # 4. Modify scores. + updated_scores = self.update_scores(scores, g_values) + # updated scores shape [batch_size, top_k] + + # 5. Check if the current watermarking context was previously used, if yes skip watermarking. + hash_result_with_just_context = hash_result_with_just_context[:, None] + is_repeated_context = (self.state.context_history == hash_result_with_just_context).any( + dim=1, + keepdim=True, + ) + self.state.context_history = torch.concat( + (hash_result_with_just_context, self.state.context_history), + dim=1, + )[:, :-1] + + updated_watermarked_scores = torch.where( + is_repeated_context, + input=scores, + other=updated_scores, + ) + return updated_watermarked_scores + + def accumulate_hash( + self, + current_hash: torch.LongTensor, + data: torch.LongTensor, + multiplier: int = 6364136223846793005, + increment: int = 1, + ) -> torch.LongTensor: + """ + Accumulate hash of data on current hash. + + Method uses adapted linear congruential generator with newlib/musl parameters. 
+ + This function has following property - + f(x, data[T]) = f(f(x, data[:T - 1]), data[T]) + + This function expects current_hash.shape and data.shape[:-1] to + match/broadcastable. + + Args: + current_hash (`torch.LongTensor`): + (shape,) + data (`torch.LongTensor`): + (shape, tensor_len) + multiplier (`int`, optional, *optional*, defaults to 6364136223846793005): + multiplier of linear congruential generator + increment (`int`, optional, *optional*, defaults to 1): + increment of linear congruential generator + + Returns: + updated hash (shape,) + """ + for i in range(data.shape[-1]): + current_hash = torch.add(current_hash, data[..., i]) + current_hash = torch.mul(current_hash, multiplier) + current_hash = torch.add(current_hash, increment) + return current_hash + + def compute_ngram_keys(self, ngrams: torch.LongTensor) -> torch.LongTensor: + """Computes random keys for each ngram and depth. + + Args: + ngrams (`torch.LongTensor`): + Ngrams (batch_size, num_ngrams, ngram_len). + + Returns: + ngram keys (batch_size, num_ngrams, depth). 
+ """ + if len(ngrams.shape) != 3: + raise ValueError( + "Ngrams should be of shape (batch_size, num_ngrams, ngram_len), but" f" is {ngrams.shape}" + ) + if ngrams.shape[2] != self.ngram_len: + raise ValueError( + "Ngrams should be of shape (batch_size, num_ngrams, ngram_len)," + f" where ngram_len is {self.ngram_len}, but is {ngrams.shape}" + ) + batch_size, _, _ = ngrams.shape + + hash_result = torch.ones(batch_size, device=self.device, dtype=torch.long) + # hash_result shape [batch_size,] + # ngrams shape [batch_size, num_ngrams, ngram_len] + hash_result = torch.vmap(self.accumulate_hash, in_dims=(None, 1), out_dims=1)(hash_result, ngrams) + # hash_result shape [batch_size, num_ngrams] + + keys = self.keys[None, None, :, None] + # hash_result shape [batch_size, num_ngrams] + # keys shape [1, 1, depth, 1] + hash_result = torch.vmap(self.accumulate_hash, in_dims=(None, 2), out_dims=2)(hash_result, keys) + # hash_result shape [batch_size, num_ngrams, depth] + + return hash_result + + def _compute_keys( + self, n_minus_1_grams: torch.LongTensor, indices: torch.LongTensor + ) -> Tuple[torch.LongTensor, torch.LongTensor]: + """Computes random keys for each ngram and depth. + + Args: + n_minus_1_grams (`torch.LongTensor`): + Ngrams (batch_size, ngram_len - 1). + indices (`torch.LongTensor`): + indices of the continuations (batch_size, num_indices) + + Returns: + Ngram keys (batch_size, num_indices, depth). + """ + batch_size, _ = n_minus_1_grams.shape + + hash_result = torch.ones(batch_size, device=self.device, dtype=torch.long) + # First hash n_minus_1 gram, for each batch entry we have a single + # n_minus_1 gram context. + # hash_result shape [batch_size] + # n_minus_1_gram shape [batch_size, ngram_len - 1] + hash_result_with_just_context = self.accumulate_hash(hash_result, n_minus_1_grams) + # hash_result shape [batch_size,] + # Indices is of shape [batch_size, num_indices], so we make it + # [batch_size, num_indices, 1] so we can vmap over num_indices dim. 
+ hash_result = torch.vmap(self.accumulate_hash, in_dims=(None, 1), out_dims=1)( + hash_result_with_just_context, indices[:, :, None] + ) + # hash_result shape [batch_size, num_indices] + # Basically we have a hash for each batch entry and each indices + # Now we add watermarking keys to this hash. + # keys are of shape [depth,] + # We add batch, num_indices and data dimension to this making it + # [1, 1, depth, 1]. + # So we can vmap over the depth dimension for compute_hash + keys = self.keys[None, None, :, None] + hash_result = torch.vmap(self.accumulate_hash, in_dims=(None, 2), out_dims=2)(hash_result, keys) + # hash_result shape should be [batch_size, num_indices, depth] + return hash_result, hash_result_with_just_context + + def sample_g_values(self, ngram_keys: torch.LongTensor) -> torch.LongTensor: + """ + Samples g values from Bernoulli distribution. + + It is not possible to pass random keys in a vectorized way in torch. Instead + we pre-compute a random sampling table, and use apply modulo table size to + map from ngram keys (int64) to g values. + + Args: + ngram_keys (`torch.LongTensor`): + Random keys (batch_size, num_ngrams, depth). + + Returns: + G values (batch_size, num_ngrams, depth). + """ + (sampling_table_size,) = self.sampling_table.shape + sampling_table = self.sampling_table.reshape((1, 1, sampling_table_size)) + ngram_keys = ngram_keys % sampling_table_size + return torch.take_along_dim(sampling_table, indices=ngram_keys, dim=2) + + def _check_input_ids_shape(self, input_ids: torch.LongTensor): + """Checks the shape of input ids.""" + if len(input_ids.shape) != 2: + raise ValueError("Input ids should be of shape (batch_size, input_len), but is" f" {input_ids.shape}") + + def compute_g_values(self, input_ids: torch.LongTensor) -> torch.LongTensor: + """ + Computes g values for each ngram from the given sequence of tokens. + + Args: + input_ids (`torch.LongTensor`): + Input token ids (batch_size, input_len). 
+ + Returns: + G values (batch_size, input_len - (ngram_len - 1), depth). + """ + self._check_input_ids_shape(input_ids) + ngrams = input_ids.unfold(dimension=1, size=self.ngram_len, step=1) + ngram_keys = self.compute_ngram_keys(ngrams) + return self.sample_g_values(ngram_keys) + + def compute_context_repetition_mask(self, input_ids: torch.LongTensor) -> torch.LongTensor: + """ + Computes repetition mask. + + 0 and 1 stand for repeated and not repeated context n-1 grams respectively. + + Args: + input_ids (`torch.LongTensor`): + Input token ids (batch_size, input_len). + + Returns: + Repetitions mask (batch_size, input_len - (ngram_len - 1)). + """ + self._check_input_ids_shape(input_ids) + batch_size, _ = input_ids.shape + state = SynthIDTextWatermarkState( + batch_size=batch_size, + ngram_len=self.ngram_len, + context_history_size=self.context_history_size, + device=self.device, + ) + contexts = input_ids[:, :-1].unfold( + dimension=1, + size=self.ngram_len - 1, + step=1, + ) + _, num_contexts, _ = contexts.shape + + are_repeated_contexts = [] + for i in range(num_contexts): + context = contexts[:, i, :] + hash_result = torch.ones(batch_size, device=self.device, dtype=torch.long) + context_hash = self.accumulate_hash(hash_result, context)[:, None] + is_repeated_context = (state.context_history == context_hash).any( + dim=1, + keepdim=True, + ) + are_repeated_contexts.append(is_repeated_context) + state.context_history = torch.concat( + (context_hash, state.context_history), + dim=1, + )[:, :-1] + are_repeated_contexts = torch.concat(are_repeated_contexts, dim=1) + + return torch.logical_not(are_repeated_contexts) + + def compute_eos_token_mask(self, input_ids: torch.LongTensor, eos_token_id: int) -> torch.LongTensor: + """ + Computes repetitions mask. + + 1 stands for ngrams that don't contain EOS tokens and vice versa. + + Args: + input_ids (`torch.LongTensor`): + Input token ids (batch_size, input_len). + eos_token_id (`int`): + EOS token ID. 
+ + Returns: + EOS token mask (batch_size, input_len). + """ + self._check_input_ids_shape(input_ids) + noneos_masks = [] + all_eos_equated = input_ids == eos_token_id + for eos_equated in all_eos_equated: + nonzero_idx = torch.nonzero(eos_equated) + noneos_mask = torch.ones_like(eos_equated) + if nonzero_idx.shape[0] != 0: + noneos_mask[nonzero_idx[0][0] :] = 0 + noneos_masks.append(noneos_mask) + return torch.stack(noneos_masks, dim=0) + + def expected_mean_g_value(self, vocab_size: int, coinflip_prob: float = 0.5) -> float: + """ + Compute expected mean g-value after watermarking, assuming uniform LM dist. + + This is the theoretical expected value for single-layer watermarking. + + Args: + vocab_size (`int`): + The size of the vocabulary. + coinflip_prob arg_name (`float`, *optional*, defaults to 0.5): + Probability of 1 in boolean prf. + + Returns: + The expected mean g-value for watermarked text. + """ + return coinflip_prob + coinflip_prob * (1 - coinflip_prob) * (1 - (1 / vocab_size)) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index c399a8a2c82..700ea0443f4 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -92,7 +92,6 @@ TopPLogitsWarper, TypicalLogitsWarper, UnbatchedClassifierFreeGuidanceLogitsProcessor, - WatermarkLogitsProcessor, ) from .stopping_criteria import ( ConfidenceCriteria, @@ -1011,15 +1010,7 @@ def _get_logits_processor( ) if generation_config.watermarking_config is not None: processors.append( - WatermarkLogitsProcessor( - vocab_size=self.config.vocab_size, - device=device, - greenlist_ratio=generation_config.watermarking_config.greenlist_ratio, - bias=generation_config.watermarking_config.bias, - hashing_key=generation_config.watermarking_config.hashing_key, - seeding_scheme=generation_config.watermarking_config.seeding_scheme, - context_width=generation_config.watermarking_config.context_width, - ) + 
generation_config.watermarking_config.construct_processor(self.config.vocab_size, device) ) # TODO (joao): find a strategy to specify the order of the processors diff --git a/src/transformers/generation/watermarking.py b/src/transformers/generation/watermarking.py index e998d996ec4..da90c03dd0d 100644 --- a/src/transformers/generation/watermarking.py +++ b/src/transformers/generation/watermarking.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team +# Copyright 2024 The HuggingFace Inc. team and Google DeepMind. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,19 +16,22 @@ import collections from dataclasses import dataclass from functools import lru_cache -from typing import Dict, Optional, Union +from typing import Any, Dict, Optional, Tuple, Union import numpy as np +import torch +from torch import nn +from torch.nn import BCELoss -from ..configuration_utils import PretrainedConfig -from ..utils import is_torch_available, logging -from .configuration_utils import WatermarkingConfig +from ..modeling_utils import PreTrainedModel +from ..utils import ModelOutput, is_torch_available, logging +from .configuration_utils import PretrainedConfig, WatermarkingConfig if is_torch_available(): import torch - from .logits_process import WatermarkLogitsProcessor + from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor logger = logging.get_logger(__name__) @@ -237,3 +240,310 @@ def __call__( confidence=confidence, ) return prediction + + +class BayesianDetectorConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`BayesianDetectorModel`]. It is used to + instantiate a Bayesian Detector model according to the specified arguments. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + watermarking_depth (`int`, *optional*): + The number of tournament layers. + base_rate (`float`, *optional*, defaults to 0.5): + Prior probability P(w) that a text is watermarked. + """ + + def __init__(self, watermarking_depth: int = None, base_rate: float = 0.5, **kwargs): + self.watermarking_depth = watermarking_depth + self.base_rate = base_rate + # These can be set later to store information about this detector. + self.model_name = None + self.watermarking_config = None + + super().__init__(**kwargs) + + def set_detector_information(self, model_name, watermarking_config): + self.model_name = model_name + self.watermarking_config = watermarking_config + + +@dataclass +class BayesianWatermarkDetectorModelOutput(ModelOutput): + """ + Base class for outputs of models predicting if the text is watermarked. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Binary cross-entropy loss on the posterior, plus the L2 penalty on the likelihood model's `delta` weights. + posterior_probabilities (`torch.FloatTensor` of shape `(batch_size,)`): + Posterior probabilities P(watermarked|g_values). + """ + + loss: Optional[torch.FloatTensor] = None + posterior_probabilities: Optional[torch.FloatTensor] = None + + +class BayesianDetectorWatermarkedLikelihood(nn.Module): + """Watermarked likelihood model for binary-valued g-values. + + This takes in g-values and returns p(g_values|watermarked). + """ + + def __init__(self, watermarking_depth: int): + """Initializes the model parameters.""" + super().__init__() + self.watermarking_depth = watermarking_depth + self.beta = torch.nn.Parameter(-2.5 + 0.001 * torch.randn(1, 1, watermarking_depth)) + self.delta = torch.nn.Parameter(0.001 * torch.randn(1, 1, self.watermarking_depth, watermarking_depth)) + + def _compute_latents(self, g_values: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Computes the unique token probability distribution given g-values.
+ + Args: + g_values (`torch.Tensor` of shape `(batch_size, seq_len, watermarking_depth)`): + PRF values. + + Returns: + p_one_unique_token and p_two_unique_tokens, both of shape + [batch_size, seq_len, watermarking_depth]. p_one_unique_token[i,t,l] + gives the probability of there being one unique token in a tournament + match on layer l, on timestep t, for batch item i. + p_one_unique_token[i,t,l] + p_two_unique_token[i,t,l] = 1. + """ + # Tile g-values to produce feature vectors for predicting the latents + # for each layer in the tournament; our model for the latents psi is a + # logistic regression model psi = sigmoid(delta * x + beta). + + # [batch_size, seq_len, watermarking_depth, watermarking_depth] + x = torch.repeat_interleave(torch.unsqueeze(g_values, dim=-2), self.watermarking_depth, axis=-2) + + # mask all elements above -1 diagonal for autoregressive factorization + x = torch.tril(x, diagonal=-1) + + # [batch_size, seq_len, watermarking_depth] + # (i, j, k, l) x (i, j, k, l) -> (i, j, k) einsum equivalent + logits = (self.delta[..., None, :] @ x.type(self.delta.dtype)[..., None]).squeeze() + self.beta + + p_two_unique_tokens = torch.sigmoid(logits) + p_one_unique_token = 1 - p_two_unique_tokens + return p_one_unique_token, p_two_unique_tokens + + def forward(self, g_values: torch.Tensor) -> torch.Tensor: + """Computes the likelihoods P(g_values|watermarked). + + Args: + g_values (`torch.Tensor` of shape `(batch_size, seq_len, watermarking_depth)`): + g-values (values 0 or 1) + + Returns: + p(g_values|watermarked) of shape [batch_size, seq_len, watermarking_depth]. + """ + p_one_unique_token, p_two_unique_tokens = self._compute_latents(g_values) + + # P(g_tl | watermarked) is equal to + # 0.5 * [ (g_tl+0.5) * p_two_unique_tokens + p_one_unique_token]. + return 0.5 * ((g_values + 0.5) * p_two_unique_tokens + p_one_unique_token) + + +class BayesianDetectorModel(PreTrainedModel): + r""" + Bayesian classifier for watermark detection. 
+ + This detector uses Bayes' rule to compute a watermarking score, which is the sigmoid of the log of ratio of the + posterior probabilities P(watermarked|g_values) and P(unwatermarked|g_values). Please see the section on + BayesianScore in the paper for further details. + Paper URL: https://www.nature.com/articles/s41586-024-08025-4 + + Note that this detector only works with non-distortionary Tournament-based watermarking using the Bernoulli(0.5) + g-value distribution. + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BayesianDetectorConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ """ + + config_class = BayesianDetectorConfig + base_model_prefix = "model" + + def __init__(self, config): + super().__init__(config) + + self.watermarking_depth = config.watermarking_depth + self.base_rate = config.base_rate + self.likelihood_model_watermarked = BayesianDetectorWatermarkedLikelihood( + watermarking_depth=self.watermarking_depth + ) + self.prior = torch.nn.Parameter(torch.tensor([self.base_rate])) + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Parameter): + module.weight.data.normal_(mean=0.0, std=0.02) + + def _compute_posterior( + self, + likelihoods_watermarked: torch.Tensor, + likelihoods_unwatermarked: torch.Tensor, + mask: torch.Tensor, + prior: float, + ) -> torch.Tensor: + """ + Compute posterior P(w|g) given likelihoods, mask and prior. + + Args: + likelihoods_watermarked (`torch.Tensor` of shape `(batch, length, depth)`): + Likelihoods P(g_values|watermarked) of g-values under watermarked model. + likelihoods_unwatermarked (`torch.Tensor` of shape `(batch, length, depth)`): + Likelihoods P(g_values|unwatermarked) of g-values under unwatermarked model. + mask (`torch.Tensor` of shape `(batch, length)`): + A binary array indicating which g-values should be used. g-values with mask value 0 are discarded. + prior (`float`): + the prior probability P(w) that the text is watermarked. + + Returns: + Posterior probability P(watermarked|g_values), shape [batch]. + """ + mask = torch.unsqueeze(mask, dim=-1) + prior = torch.clamp(prior, min=1e-5, max=1 - 1e-5) + log_likelihoods_watermarked = torch.log(torch.clamp(likelihoods_watermarked, min=1e-30, max=float("inf"))) + log_likelihoods_unwatermarked = torch.log(torch.clamp(likelihoods_unwatermarked, min=1e-30, max=float("inf"))) + log_odds = log_likelihoods_watermarked - log_likelihoods_unwatermarked + + # Sum relative surprisals (log odds) across all token positions and layers. 
+ relative_surprisal_likelihood = torch.einsum("i...->i", log_odds * mask) + + # Compute the relative surprisal prior + relative_surprisal_prior = torch.log(prior) - torch.log(1 - prior) + + # Combine prior and likelihood. + # [batch_size] + relative_surprisal = relative_surprisal_prior + relative_surprisal_likelihood + + # Compute the posterior probability P(w|g) = sigmoid(relative_surprisal). + return torch.sigmoid(relative_surprisal) + + def forward( + self, + g_values: torch.Tensor, + mask: torch.Tensor, + labels: Optional[torch.Tensor] = None, + loss_batch_weight=1, + return_dict=False, + ) -> BayesianWatermarkDetectorModelOutput: + """ + Computes the watermarked posterior P(watermarked|g_values). + + Args: + g_values (`torch.Tensor` of shape `(batch_size, seq_len, watermarking_depth, ...)`): + g-values (with values 0 or 1) + mask: + A binary array shape [batch_size, seq_len] indicating which g-values should be used. g-values with mask + value 0 are discarded. + + Returns: + p(watermarked | g_values), of shape [batch_size]. + """ + + likelihoods_watermarked = self.likelihood_model_watermarked(g_values) + likelihoods_unwatermarked = 0.5 * torch.ones_like(g_values) + out = self._compute_posterior( + likelihoods_watermarked=likelihoods_watermarked, + likelihoods_unwatermarked=likelihoods_unwatermarked, + mask=mask, + prior=self.prior, + ) + + loss = None + if labels is not None: + loss_fct = BCELoss() + loss_unwweight = torch.sum(self.likelihood_model_watermarked.delta**2) + loss_weight = loss_unwweight * loss_batch_weight + loss = loss_fct(torch.clamp(out, 1e-5, 1 - 1e-5), labels) + loss_weight + + if not return_dict: + return (out,) if loss is None else (out, loss) + + return BayesianWatermarkDetectorModelOutput(loss=loss, posterior_probabilities=out) + + +class SynthIDTextWatermarkDetector: + r""" + SynthID text watermark detector class. 
+ + This class has to be initialized with a trained Bayesian detector module. Check the script + in examples/research_projects/synthid_text/detector_training.py for an example of training/saving/loading this + detector module. The folder also showcases an example use case of this detector. + + Parameters: + detector_module ([`BayesianDetectorModel`]): + Bayesian detector module object initialized with parameters. + Check examples/research_projects/synthid_text/detector_training.py for usage. + logits_processor (`SynthIDTextWatermarkLogitsProcessor`): + The logits processor used for watermarking. + tokenizer (`Any`): + The tokenizer used for the model. + + Examples: + ```python + >>> from transformers import ( + ... AutoTokenizer, BayesianDetectorModel, SynthIDTextWatermarkLogitsProcessor, SynthIDTextWatermarkDetector + ... ) + + >>> # Load the detector. See examples/research_projects/synthid_text for training a detector. + >>> detector_model = BayesianDetectorModel.from_pretrained("joaogante/dummy_synthid_detector") + >>> logits_processor = SynthIDTextWatermarkLogitsProcessor( + ... **detector_model.config.watermarking_config, device="cpu" + ...
) + >>> tokenizer = AutoTokenizer.from_pretrained(detector_model.config.model_name) + >>> detector = SynthIDTextWatermarkDetector(detector_model, logits_processor, tokenizer) + + >>> # Test whether a certain string is watermarked + >>> test_input = tokenizer(["This is a test input"], return_tensors="pt") + >>> is_watermarked = detector(test_input.input_ids) + ``` + """ + + def __init__( + self, + detector_module: BayesianDetectorModel, + logits_processor: SynthIDTextWatermarkLogitsProcessor, + tokenizer: Any, + ): + self.detector_module = detector_module + self.logits_processor = logits_processor + self.tokenizer = tokenizer + + def __call__(self, tokenized_outputs: torch.Tensor): + # eos mask is computed, skip first ngram_len - 1 tokens + # eos_mask will be of shape [batch_size, output_len] + eos_token_mask = self.logits_processor.compute_eos_token_mask( + input_ids=tokenized_outputs, + eos_token_id=self.tokenizer.eos_token_id, + )[:, self.logits_processor.ngram_len - 1 :] + + # context repetition mask is computed + context_repetition_mask = self.logits_processor.compute_context_repetition_mask( + input_ids=tokenized_outputs, + ) + # context repetition mask shape [batch_size, output_len - (ngram_len - 1)] + + combined_mask = context_repetition_mask * eos_token_mask + + g_values = self.logits_processor.compute_g_values( + input_ids=tokenized_outputs, + ) + # g values shape [batch_size, output_len - (ngram_len - 1), depth] + return self.detector_module(g_values, combined_mask) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index e109ea659c7..36e1ff2cfe6 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -191,6 +191,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class BayesianDetectorConfig(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class
BayesianDetectorModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class BeamScorer(metaclass=DummyObject): _backends = ["torch"] @@ -457,6 +471,27 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class SynthIDTextWatermarkDetector(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class SynthIDTextWatermarkingConfig(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class SynthIDTextWatermarkLogitsProcessor(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class TemperatureLogitsWarper(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/generation/test_logits_process.py b/tests/generation/test_logits_process.py index a5d3ab37efa..aeebb5c4c53 100644 --- a/tests/generation/test_logits_process.py +++ b/tests/generation/test_logits_process.py @@ -16,6 +16,7 @@ import unittest from typing import List, Union +import numpy as np from parameterized import parameterized from transformers import is_torch_available @@ -48,6 +49,7 @@ PrefixConstrainedLogitsProcessor, RepetitionPenaltyLogitsProcessor, SequenceBiasLogitsProcessor, + SynthIDTextWatermarkLogitsProcessor, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper, @@ -975,3 +977,187 @@ def test_watermarking_processor(self): scores_wo_bias = scores[:, -1].clone() out = watermark(input_ids=input_ids, scores=scores) self.assertTrue((out[:, 1] == scores_wo_bias + watermark.bias).all()) + + @parameterized.expand([(5, 3, 10000), (10, 5, 1000)]) + def test_synthidtext_watermarking_processor_bias_uniformity(self, ngram_len, num_layers, vocab_size): + """Test SynthID watermarked distribution bias uniformity over iterations.""" + torch.manual_seed(0) + 
np.random.seed(0) + watermarking_config = { + "ngram_len": ngram_len, + "keys": np.random.randint(low=0, high=2**16, size=(num_layers,)), + "sampling_table_size": 2**16, + "sampling_table_seed": 0, + "context_history_size": 512, + "device": torch_device, + } + batch_size = 100000 + ngrams = torch.randint( + low=0, + high=vocab_size, + size=(batch_size, ngram_len), + device=torch_device, + ) + + logits_processor = SynthIDTextWatermarkLogitsProcessor(**watermarking_config) + g_values = logits_processor.compute_g_values(ngrams) + g_values_mean = torch.mean(torch.mean(g_values.float(), dim=0)) + self.assertAlmostEqual(g_values_mean, 0.5, delta=0.01) + + @parameterized.expand([(10000, 3), (1000, 20)]) + def test_synthidtext_watermark_processor_bias_uniformity_across_vocab(self, vocab_size, num_layers): + """Test SynthID watermarked distribution bias uniformity over vocabs of the model.""" + batch_size = 1000 + ngram_len = 5 + torch.manual_seed(0) + np.random.seed(0) + watermarking_config = { + "ngram_len": ngram_len, + "keys": np.random.randint(low=0, high=2**16, size=(num_layers,)), + "sampling_table_size": 2**16, + "sampling_table_seed": 0, + "context_history_size": 512, + "device": torch_device, + } + n_minus_1_grams = torch.randint( + low=0, + high=vocab_size, + size=(batch_size, watermarking_config["ngram_len"] - 1), + device=torch_device, + ) + + logits_processor = SynthIDTextWatermarkLogitsProcessor(**watermarking_config) + ngram_keys, _ = logits_processor._compute_keys( + n_minus_1_grams, + torch.stack([torch.arange(vocab_size, device=torch_device) for _ in range(batch_size)]), + ) + + g_values = logits_processor.sample_g_values(ngram_keys) + # g_values shape should be [batch_size, vocab_size, num_layers] + g_values_mean = torch.mean(torch.mean(g_values.float(), dim=1)) + self.assertAlmostEqual(g_values_mean, 0.5, delta=0.001) + + @parameterized.expand([(2, "uniform"), (10, "uniform"), (2, "random"), (10, "random")]) + def 
test_synthidtext_watermark_processor_distributional_convergence(self, vocab_size, logits_type): + """Check if watermarked distribution converges to unwatermarked logits distribution.""" + batch_size = 1500 + num_keys = 1000 + + updated_softmaxes = 0 + np.random.seed(0) + torch.manual_seed(0) + if logits_type == "uniform": + fixed_logits = torch.ones((batch_size, vocab_size), device=torch_device) + elif logits_type == "random": + fixed_logits = torch.rand( + ( + 1, + vocab_size, + ), + device=torch_device, + ) + fixed_logits = fixed_logits.repeat(batch_size, 1) + else: + raise ValueError(f"Unrecognized logits_type {logits_type}") + for _ in range(num_keys): + watermarking_config = { + "ngram_len": 5, + "keys": np.random.randint(0, 10**9, size=(1,), dtype=np.int64), + "sampling_table_size": 2**16, + "sampling_table_seed": 0, + "context_history_size": 1024, + "device": torch_device, + } + + logits_processor = SynthIDTextWatermarkLogitsProcessor(**watermarking_config) + + ngrams = torch.randint( + low=0, + high=vocab_size, + size=(batch_size, watermarking_config["ngram_len"]), + device=torch_device, + ) + + # Insert ngram-1 into logit_processor state. + for idx in range(watermarking_config["ngram_len"] - 1): + _ = logits_processor(ngrams[:, :idx], fixed_logits) + + updated_scores = logits_processor(ngrams, fixed_logits) + updated_softmaxes += torch.nn.functional.softmax(updated_scores, dim=1).cpu().numpy() + + updated_softmaxes = np.mean(updated_softmaxes, axis=0) / num_keys + is_close = torch.all( + torch.isclose( + torch.tensor(updated_softmaxes, device=torch_device), + torch.nn.Softmax()(fixed_logits[0]), # Take any batch entry, all are same. 
+ atol=1e-3, + rtol=0, + ) + ) + self.assertTrue(is_close) + + @parameterized.expand([(2, 10, 1, 0.01), (100, 5, 1, 0.01), (100, 10, 2, 0.02)]) + def test_synthidtext_watermark_processor_bias_test(self, vocab_size, ngram_len, num_layers, atol): + """Test SynthID watermarking bias matches theoretical value.""" + batch_size = 20000 + generator = torch.Generator(device=torch_device).manual_seed(0) + np.random.seed(0) + + keys = [np.random.randint(0, 10**9) for _ in range(num_layers)] + # Use 10**9 rather than vocab_size to ensure variety in (n-1)-grams. + context = torch.randint( + low=0, + high=10**9, + size=(batch_size, ngram_len - 1), + dtype=torch.int64, + generator=generator, + device=torch_device, + ) + + context_history_size = 1024 + logits_processor = SynthIDTextWatermarkLogitsProcessor( + ngram_len=ngram_len, + keys=keys, + sampling_table_size=2**16, + sampling_table_seed=0, + context_history_size=context_history_size, + device=torch_device, + ) + + scores = torch.ones( + (batch_size, vocab_size), + dtype=torch.float64, + device=torch_device, + ) + # Init state of the logits processor. + logits_processor(context, scores) + # insert context into the state. 
+ for idx in range(1, ngram_len - 1): + _ = logits_processor(context[:, :idx], scores) + + updated_scores = logits_processor(context, scores) + + probs = torch.nn.functional.softmax(updated_scores, dim=1) + generator = torch.Generator(device=torch_device).manual_seed(0) + next_tokens = torch.multinomial( + probs, + num_samples=1, + generator=generator, + ) + + ngrams = torch.concat((context, next_tokens), dim=1) + g_values = logits_processor.compute_g_values(ngrams) + mean_g_values = g_values.mean(dtype=torch.float64, dim=(0, 1)) + + expected_mean_g_value = logits_processor.expected_mean_g_value( + vocab_size=vocab_size, + ) + is_close = torch.all( + torch.isclose( + mean_g_values, + torch.tensor(expected_mean_g_value, dtype=torch.float64, device=torch_device), + atol=atol, + rtol=0, + ) + ) + self.assertTrue(is_close) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 996d95eb80f..4e5d8f30265 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -84,6 +84,7 @@ SampleEncoderDecoderOutput, StoppingCriteria, StoppingCriteriaList, + SynthIDTextWatermarkingConfig, WatermarkDetector, WatermarkingConfig, ) @@ -2517,9 +2518,9 @@ def test_beam_search_low_memory(self): self.assertListEqual(low_output.tolist(), high_output.tolist()) @slow - def test_watermark_generation(self): - tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") - model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2").to(torch_device) + def test_green_red_watermark_generation(self): + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") tokenizer.pad_token_id = tokenizer.eos_token_id model_inputs = tokenizer("I will be", return_tensors="pt").to(torch_device) input_len = model_inputs["input_ids"].shape[-1] @@ -2548,6 +2549,61 @@ def test_watermark_generation(self): 
self.assertListEqual(detection_out_watermarked.prediction.tolist(), [True]) self.assertListEqual(detection_out.prediction.tolist(), [False]) + """Check the mean bias inserted by the watermarking algorithm.""" + + @slow + def test_synthid_text_watermark_generation_mean_expected_bias(self): + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + tokenizer.pad_token_id = tokenizer.eos_token_id + model_inputs = tokenizer("I will be", return_tensors="pt").to(torch_device) + input_len = 5 + batch_size = 200 + + # generation should work with both input types: WatermarkingConfig or Dict, so let's check it here :) + watermark_config = SynthIDTextWatermarkingConfig(keys=[10, 20], ngram_len=5, debug_mode=True) + logits_processor = watermark_config.construct_processor(model.config.vocab_size, torch_device) + mean_g_values_repeats = [] + for _ in range(40): + input_ids = torch.zeros( + (batch_size, input_len), + dtype=torch.int64, + device=torch_device, + ) + model_inputs = { + "input_ids": input_ids, + "attention_mask": torch.ones_like(input_ids, device=torch_device), + } + output = model.generate( + **model_inputs, watermarking_config=watermark_config, do_sample=True, max_length=500, top_k=1000 + ) + g_values = logits_processor.compute_g_values(input_ids=output[:, input_len:]) + context_repetition_mask = logits_processor.compute_context_repetition_mask( + input_ids=output[:, input_len:], + ).unsqueeze(dim=2) + + mean_g_values = torch.masked.mean( + g_values, + mask=context_repetition_mask, + dim=0, + keepdim=True, + dtype=torch.float64, + ) + mean_g_values_repeats.append(mean_g_values) + + mean_g_values = torch.concat(mean_g_values_repeats, dim=0).mean(dim=0) + expected_mean_g_value = logits_processor.expected_mean_g_value( + vocab_size=model.config.vocab_size, + ) + atol = 0.03 + is_close = torch.isclose( + mean_g_values, + 
torch.tensor(expected_mean_g_value, dtype=torch.float64), + atol=atol, + rtol=0, + ) + self.assertTrue(torch.all(is_close)) + @slow def test_beam_search_example_integration(self): # PT-only test: TF doesn't have a BeamSearchScorer From 65753d6065e4d6e79199c923494edbf0d6248fb1 Mon Sep 17 00:00:00 2001 From: Abhishek Maurya <124327945+Abhishek-TAMU@users.noreply.github.com> Date: Thu, 24 Oct 2024 05:02:54 -0400 Subject: [PATCH 06/99] Remove graph breaks for torch.compile() in flash_attention_forward when Lllama Model is padding free tuned (#33932) * fix: fixes for graph breaks Signed-off-by: Abhishek * fix: formatting Signed-off-by: Abhishek * fix: import error Signed-off-by: Abhishek * fix: Add Fa2Kwargs Signed-off-by: Abhishek * fix: PR Changes Signed-off-by: Abhishek * PR changes Signed-off-by: Abhishek * PR changes Signed-off-by: Abhishek * PR changes Signed-off-by: Abhishek * PR changes Signed-off-by: Abhishek * Revert "PR changes" This reverts commit 39d2868e5c93cc5f3f3c7c6ff981b66614c0e0e4. 
* PR changes Signed-off-by: Abhishek * fix: FlashAttentionKwarg Signed-off-by: Abhishek * fix: FlashAttentionKwarg Signed-off-by: Abhishek * PR Changes Signed-off-by: Abhishek * PR Changes Signed-off-by: Abhishek * PR Changes Signed-off-by: Abhishek * PR Changes Signed-off-by: Abhishek * PR Changes Signed-off-by: Abhishek * addition of documentation Signed-off-by: Abhishek * change in _flash_attention_forward Signed-off-by: Abhishek * make fix-copies Signed-off-by: Abhishek * revert make fix-copies Signed-off-by: Abhishek * fix copies * style * loss kwargs typing * style and pull latest changes --------- Signed-off-by: Abhishek Co-authored-by: Arthur Zucker --- docs/source/en/llm_optims.md | 93 +++++++++++++++++++ .../modeling_flash_attention_utils.py | 65 ++++++++++--- .../models/cohere/modeling_cohere.py | 4 + src/transformers/models/glm/modeling_glm.py | 14 ++- src/transformers/models/glm/modular_glm.py | 2 + .../models/llama/modeling_llama.py | 16 +++- src/transformers/tokenization_utils_base.py | 2 +- src/transformers/utils/__init__.py | 1 + src/transformers/utils/generic.py | 15 ++- 9 files changed, 192 insertions(+), 20 deletions(-) diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index 16be638498d..0a6a7e15bea 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -348,6 +348,99 @@ model = AutoModelForCausalLM.from_pretrained( ) ``` +### Fine-Tuning with torch.compile and Padding-Free Data Collation + +In addition to optimizing inference, you can also enhance the training efficiency of large language models by leveraging torch.compile during fine-tuning and using a padding-free data collator. This approach can significantly speed up training and reduce computational overhead. 
+ +Here's how you can fine-tune a Llama model using SFTTrainer from the TRL library, with torch_compile enabled and a padding-free data collator: + +``` +#################### IMPORTS ################### + +import math +import datasets +import dataclasses +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + TrainingArguments +) +from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM + +#################### MODEL LOADING WITH FLASH ATTENTION ################### + +model_name = "meta-llama/Llama-3.2-1B" +model = AutoModelForCausalLM.from_pretrained( + model_name, + attn_implementation="flash_attention_2" # Enables FlashAttention-2 +) +tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + +#################### DATA PREPROCESSING (PADDING-FREE) ################### + +response_template = "\n### Label:" +response_template_ids = tokenizer.encode( + response_template, add_special_tokens=False +)[2:] # Exclude special tokens + +data_collator = DataCollatorForCompletionOnlyLM( + response_template_ids=response_template_ids, + tokenizer=tokenizer, + ignore_index=-100, + padding_free=True # Enables padding-free collation +) + +def format_dataset(example): + return { + "output": example["output"] + tokenizer.eos_token + } + +data_files = {"train": "path/to/dataset"} # Replace with your dataset path +json_dataset = datasets.load_dataset("json", data_files=data_files) +formatted_train_dataset = json_dataset["train"].map(format_dataset) + +################# TRAINING CONFIGURATION ############################ + +train_args = TrainingArguments( + num_train_epochs=5, + per_device_train_batch_size=4, + per_device_eval_batch_size=4, + gradient_accumulation_steps=4, + learning_rate=1e-5, + weight_decay=0.0, + warmup_ratio=0.03, + lr_scheduler_type="cosine", + logging_steps=1, + include_tokens_per_second=True, + save_strategy="epoch", + output_dir="output", + torch_compile=True, # Enables torch.compile + torch_compile_backend="inductor", + 
torch_compile_mode="default" +) + +# Convert TrainingArguments to SFTConfig +transformer_train_arg_fields = [x.name for x in dataclasses.fields(SFTConfig)] +transformer_kwargs = { + k: v + for k, v in train_args.to_dict().items() + if k in transformer_train_arg_fields +} +training_args = SFTConfig(**transformer_kwargs) + +####################### FINE-TUNING ##################### + +trainer = SFTTrainer( + model=model, + tokenizer=tokenizer, + train_dataset=formatted_train_dataset, + data_collator=data_collator, + dataset_text_field="output", + args=training_args, +) +trainer.train() +``` + ### PyTorch scaled dot product attention Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and it supports FlashAttention, xFormers, and PyTorch's C++ implementation. SDPA chooses the most performant attention algorithm if you're using a CUDA backend. For other backends, SDPA defaults to the PyTorch C++ implementation. diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py index da961c6060e..045d2f6d646 100644 --- a/src/transformers/modeling_flash_attention_utils.py +++ b/src/transformers/modeling_flash_attention_utils.py @@ -15,7 +15,7 @@ import inspect import os -from typing import Optional, Tuple +from typing import Optional, Tuple, TypedDict import torch import torch.nn.functional as F @@ -180,6 +180,10 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids): return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length)) +flash_241 = is_flash_attn_greater_or_equal("2.4.1") +deterministic_g = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1" + + def _flash_attention_forward( query_states: torch.Tensor, key_states: torch.Tensor, @@ -194,6 +198,10 @@ def _flash_attention_forward( use_top_left_mask: bool = False, softcap: Optional[float] = None, deterministic: bool = None, + cu_seq_lens_q: Optional[torch.LongTensor] = None, + cu_seq_lens_k: 
Optional[torch.LongTensor] = None, + max_length_q: Optional[int] = None, + max_length_k: Optional[int] = None, ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token @@ -232,9 +240,9 @@ def _flash_attention_forward( ) flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {} - if is_flash_attn_greater_or_equal("2.4.1"): + if flash_241: if deterministic is None: - deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1" + deterministic = deterministic_g flash_kwargs["deterministic"] = deterministic if softcap is not None: @@ -267,24 +275,32 @@ def _flash_attention_forward( # If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing # then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage. # Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach - # Note: the `torch.diff(...)` condition is last to use short-circuit and avoid the cuda synchronization it incurs during inference (query_length == 1 always) - elif position_ids is not None and query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all(): + elif position_ids is not None and ( + max_length_q is not None or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all()) + ): batch_size = query_states.size(0) - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids( - query_states, key_states, value_states, position_ids - ) - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + if cu_seq_lens_q is None or cu_seq_lens_k is None: + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = ( + prepare_fa2_from_position_ids(query_states, key_states, value_states, position_ids) + ) + + cu_seq_lens_q, 
cu_seq_lens_k = cu_seq_lens + max_length_q, max_length_k = max_seq_lens + + else: + query_states = query_states.reshape(-1, query_states.size(-2), query_states.size(-1)) + key_states = key_states.reshape(-1, key_states.size(-2), key_states.size(-1)) + value_states = value_states.reshape(-1, value_states.size(-2), value_states.size(-1)) attn_output = flash_attn_varlen_func( query_states, key_states, value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, + cu_seqlens_q=cu_seq_lens_q, + cu_seqlens_k=cu_seq_lens_k, + max_seqlen_q=max_length_q, + max_seqlen_k=max_length_k, dropout_p=dropout, softmax_scale=softmax_scale, causal=causal, @@ -299,3 +315,24 @@ def _flash_attention_forward( ) return attn_output + + +class FlashAttentionKwargs(TypedDict, total=False): + """ + Keyword arguments for Flash Attention with Compile. + + Attributes: + cu_seq_lens_q (`torch.LongTensor`, *optional*): + Gets cumulative sequence length for query state. + cu_seq_lens_k (`torch.LongTensor`, *optional*): + Gets cumulative sequence length for key state. + max_length_q (`int`, *optional*): + Maximum sequence length for query state. + max_length_k (`int`, *optional*): + Maximum sequence length for key state.
+ """ + + cu_seq_lens_q: Optional[torch.LongTensor] + cu_seq_lens_k: Optional[torch.LongTensor] + max_length_q: Optional[int] + max_length_k: Optional[int] diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 9aa588be431..b215fb6561b 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -33,12 +33,14 @@ from ...cache_utils import Cache, DynamicCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, ) from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( add_start_docstrings, @@ -832,6 +834,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], ) -> Union[Tuple, BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -913,6 +916,7 @@ def forward( use_cache=use_cache, cache_position=cache_position, position_embeddings=position_embeddings, + **flash_attn_kwargs, ) hidden_states = layer_outputs[0] diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index aad4da282b7..6354e20e33f 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -38,6 +38,7 @@ ) from ...modeling_utils import PreTrainedModel from ...utils import ( + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, 
is_flash_attn_2_available, @@ -51,7 +52,11 @@ if is_flash_attn_2_available(): from ...modeling_flash_attention_utils import _flash_attention_forward -from ...modeling_flash_attention_utils import _flash_attention_forward +from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward +from ...processing_utils import Unpack + + +_CHECKPOINT_FOR_DOC = "dummy" class GlmRMSNorm(nn.Module): @@ -736,6 +741,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], ) -> Union[Tuple, BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -817,6 +823,7 @@ def forward( use_cache=use_cache, cache_position=cache_position, position_embeddings=position_embeddings, + **flash_attn_kwargs, ) hidden_states = layer_outputs[0] @@ -1222,6 +1229,11 @@ def set_input_embeddings(self, value): self.model.embed_tokens = value @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) def forward( self, input_ids: Optional[torch.LongTensor] = None, diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 55bf89d1c56..c26477fdc17 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -46,6 +46,8 @@ logger = logging.get_logger(__name__) +_CHECKPOINT_FOR_DOC = "dummy" + class GlmRMSNorm(Phi3RMSNorm): pass diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 617ef38e4ae..4d95f01849d 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -29,7 +29,7 @@ from 
...cache_utils import Cache, DynamicCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter -from ...modeling_flash_attention_utils import _flash_attention_forward +from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, @@ -39,8 +39,10 @@ ) from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( + LossKwargs, add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, @@ -422,6 +424,7 @@ def forward( use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs: Unpack[FlashAttentionKwargs], ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if isinstance(past_key_value, StaticCache): raise ValueError( @@ -506,6 +509,7 @@ def forward( sliding_window=getattr(self, "sliding_window", None), use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal, + **kwargs, ) attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() @@ -870,6 +874,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], ) -> Union[Tuple, BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -951,6 +956,7 @@ def forward( use_cache=use_cache, cache_position=cache_position, position_embeddings=position_embeddings, + **flash_attn_kwargs, ) hidden_states = layer_outputs[0] @@ -1102,6 +1108,9 @@ def 
_prepare_4d_causal_attention_mask_with_cache_position( return causal_mask +class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... + + class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] @@ -1148,7 +1157,7 @@ def forward( return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, num_logits_to_keep: int = 0, - **loss_kwargs, + **kwargs: Unpack[KwargsForCausalLM], ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1198,6 +1207,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, cache_position=cache_position, + **kwargs, ) hidden_states = outputs[0] @@ -1211,7 +1221,7 @@ def forward( loss = None if labels is not None: - loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **loss_kwargs) + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 16c05a14028..4f3187d510f 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -815,7 +815,7 @@ def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding": # Otherwise it passes the casts down and casts the LongTensor containing the token idxs # into a HalfTensor if isinstance(device, str) or is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) for k, v in self.data.items() if isinstance(v, torch.Tensor)} + self.data = {k: v.to(device=device) if isinstance(v, torch.Tensor) else v for k, v in self.data.items()} else: logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. 
This is not supported.") return self diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index a781389c2fb..2a10bcaa3c9 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -37,6 +37,7 @@ from .generic import ( ContextManagers, ExplicitEnum, + LossKwargs, ModelOutput, PaddingStrategy, TensorType, diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index a5f01fa2e0d..26ec82b20fd 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -24,7 +24,7 @@ from dataclasses import fields, is_dataclass from enum import Enum from functools import partial, wraps -from typing import Any, ContextManager, Iterable, List, Optional, Tuple +from typing import Any, ContextManager, Iterable, List, Optional, Tuple, TypedDict import numpy as np from packaging import version @@ -854,3 +854,16 @@ def wrapper(*args, **kwargs): return wrapper return decorator + + +class LossKwargs(TypedDict, total=False): + """ + Keyword arguments to be passed to the loss function + + Attributes: + num_items_in_batch (`int`, *optional*): + Number of items in the batch. It is recommended to pass it when + you are doing gradient accumulation. 
+ """ + + num_items_in_batch: Optional[int] From 05863817d6bdf8183f9acc33c21b7e4fc026005a Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:11:55 +0200 Subject: [PATCH 07/99] Better defaults (#34026) * be nice to our usres * nit * fixup * default to -1 * oups * turbo nit * auto infer framework --- src/transformers/generation/utils.py | 2 ++ src/transformers/pipelines/base.py | 17 ++++------------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 700ea0443f4..3938457155d 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1440,6 +1440,8 @@ def _prepare_generated_length( and not self.config.is_encoder_decoder ): generation_config.max_length -= inputs_tensor.shape[1] + else: # by default let's always generate 10 new tokens + generation_config.max_length = generation_config.max_length + input_ids_length # same for min length if generation_config.min_new_tokens is not None: diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 042958cbb0c..25c2a11564c 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -881,18 +881,7 @@ def __init__( # Take the first device used by `accelerate`. device = next(iter(hf_device_map.values())) else: - device = -1 - if ( - is_torch_mlu_available() - or is_torch_cuda_available() - or is_torch_npu_available() - or is_torch_xpu_available(check_device=True) - or is_torch_mps_available() - ): - logger.warning( - "Hardware accelerator e.g. GPU is available in the environment, but no `device` argument" - " is passed to the `Pipeline` object. Model will be on CPU." 
- ) + device = 0 if is_torch_available() and self.framework == "pt": if device == -1 and self.model.device is not None: @@ -920,10 +909,12 @@ def __init__( elif is_torch_mps_available(): self.device = torch.device(f"mps:{device}") else: - raise ValueError(f"{device} unrecognized or not available.") + self.device = torch.device("cpu") else: self.device = device if device is not None else -1 + logger.warning(f"Device set to use {self.device}") + self.binary_output = binary_output # We shouldn't call `model.to()` for models loaded with accelerate as well as the case that model is already on device if ( From f0e640adfa3cedea53912b95e3093f05cc2b66b5 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:16:55 +0200 Subject: [PATCH 08/99] Drop support for Python 3.8 (#34314) * drop python 3.8 * update docker files --------- Co-authored-by: ydshieh --- CONTRIBUTING.md | 2 +- README.md | 2 +- docker/transformers-all-latest-gpu/Dockerfile | 2 +- docker/transformers-pytorch-gpu/Dockerfile | 2 +- docker/transformers-quantization-latest-gpu/Dockerfile | 8 ++++---- docker/transformers-tensorflow-gpu/Dockerfile | 2 +- docs/source/de/contributing.md | 2 +- docs/source/ko/contributing.md | 2 +- docs/source/zh/contributing.md | 2 +- setup.py | 5 ++--- src/transformers/dependency_versions_table.py | 2 +- 11 files changed, 15 insertions(+), 16 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4d62a44ab25..9eeea997154 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -132,7 +132,7 @@ You will need basic `git` proficiency to contribute to manual. Type `git --help` in a shell and enjoy! If you prefer books, [Pro Git](https://git-scm.com/book/en/v2) is a very good reference. -You'll need **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L449)** or above to contribute to 🤗 Transformers. 
Follow the steps below to start contributing: +You'll need **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L449)** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing: 1. Fork the [repository](https://github.com/huggingface/transformers) by clicking on the **[Fork](https://github.com/huggingface/transformers/fork)** button on the repository's page. This creates a copy of the code diff --git a/README.md b/README.md index 68e2a215d4c..c748e675066 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta ### With pip -This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.11+, and TensorFlow 2.6+. +This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, and TensorFlow 2.6+. You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). 
diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 08e37ea6e12..93f9b6f6a17 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile index 2c1f153eef2..62578ad0f36 100644 --- a/docker/transformers-pytorch-gpu/Dockerfile +++ b/docker/transformers-pytorch-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 0617ac8cdd7..53e66662f9e 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive @@ -9,12 +9,12 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.2.1' +ARG PYTORCH='2.4.1' # Example: `cu102`, `cu113`, etc. 
ARG CUDA='cu118' RUN apt update -RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python python3-pip ffmpeg +RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main @@ -53,7 +53,7 @@ RUN python3 -m pip install --no-cache-dir gguf # Add autoawq for quantization testing # >=v0.2.3 needed for compatibility with torch 2.2.1 -RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl +RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp310-cp310-linux_x86_64.whl # Add quanto for quantization testing RUN python3 -m pip install --no-cache-dir optimum-quanto diff --git a/docker/transformers-tensorflow-gpu/Dockerfile b/docker/transformers-tensorflow-gpu/Dockerfile index adccee1ace4..d765767780f 100644 --- a/docker/transformers-tensorflow-gpu/Dockerfile +++ b/docker/transformers-tensorflow-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive diff --git a/docs/source/de/contributing.md b/docs/source/de/contributing.md index 4c0e131a352..d014dd67c83 100644 --- a/docs/source/de/contributing.md +++ b/docs/source/de/contributing.md @@ -112,7 +112,7 @@ Bevor Sie irgendwelchen Code schreiben, empfehlen wir Ihnen dringend, die besteh Sie benötigen grundlegende `git`-Kenntnisse, um zu 🤗 Transformers beizutragen. Obwohl `git` nicht das einfachste Werkzeug ist, hat es ein sehr gutes Handbuch. Geben Sie `git --help` in eine Shell ein und genießen Sie es! Wenn Sie Bücher bevorzugen, ist [Pro Git](https://git-scm.com/book/en/v2) eine gute Anlaufstelle. 
-Sie benötigen **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** oder höher, um zu 🤗 Transformers beizutragen. Folgen Sie den nachstehenden Schritten, um mit dem Beitrag zu beginnen: +Sie benötigen **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** oder höher, um zu 🤗 Transformers beizutragen. Folgen Sie den nachstehenden Schritten, um mit dem Beitrag zu beginnen: 1. Forken Sie das [Repository](https://github.com/huggingface/transformers), indem Sie auf den **[Fork](https://github.com/huggingface/transformers/fork)**-Button auf der Seite des Repositorys klicken. Dadurch wird eine Kopie des Codes auf Ihrem GitHub-Account erstellt. diff --git a/docs/source/ko/contributing.md b/docs/source/ko/contributing.md index f5003eff07c..99f1d2b6664 100644 --- a/docs/source/ko/contributing.md +++ b/docs/source/ko/contributing.md @@ -113,7 +113,7 @@ python src/transformers/commands/transformers_cli.py env 🤗 Transformers에 기여하기 위해서는 기본적인 `git` 사용 능력이 필요합니다. `git`은 사용하기 쉬운 도구는 아니지만, 매우 훌륭한 매뉴얼을 제공합니다. 쉘(shell)에서 `git --help`을 입력하여 확인해보세요! 만약 책을 선호한다면, [Pro Git](https://git-scm.com/book/en/v2)은 매우 좋은 참고 자료가 될 것입니다. -🤗 Transformers에 기여하려면 **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** 이상의 버전이 필요합니다. 기여를 시작하려면 다음 단계를 따르세요: +🤗 Transformers에 기여하려면 **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** 이상의 버전이 필요합니다. 기여를 시작하려면 다음 단계를 따르세요: 1. 저장소 페이지에서 **[Fork](https://github.com/huggingface/transformers/fork)** 버튼을 클릭하여 저장소를 포크하세요. 이렇게 하면 코드의 복사본이 여러분의 GitHub 사용자 계정 아래에 생성됩니다. 
diff --git a/docs/source/zh/contributing.md b/docs/source/zh/contributing.md index 9c247a60a14..b525754359b 100644 --- a/docs/source/zh/contributing.md +++ b/docs/source/zh/contributing.md @@ -112,7 +112,7 @@ python src/transformers/commands/transformers_cli.py env 要为 🤗 Transformers 做贡献,你需要基本的 `git` 使用技能。虽然 `git` 不是一个很容易使用的工具,但它提供了非常全面的手册,在命令行中输入 `git --help` 并享受吧!如果你更喜欢书籍,[Pro Git](https://git-scm.com/book/en/v2)是一本很好的参考书。 -要为 🤗 Transformers 做贡献,你需要 **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** 或更高版本。请按照以下步骤开始贡献: +要为 🤗 Transformers 做贡献,你需要 **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** 或更高版本。请按照以下步骤开始贡献: 1. 点击[仓库](https://github.com/huggingface/transformers)页面上的 **[Fork](https://github.com/huggingface/transformers/fork)** 按钮,这会在你的 GitHub 账号下拷贝一份代码。 diff --git a/setup.py b/setup.py index 1846f7bf97b..f4028d13c44 100644 --- a/setup.py +++ b/setup.py @@ -150,7 +150,7 @@ "pytest>=7.2.0,<8.0.0", "pytest-timeout", "pytest-xdist", - "python>=3.8.0", + "python>=3.9.0", "ray[tune]>=2.7.0", "regex!=2019.12.17", "requests", @@ -451,7 +451,7 @@ def run(self): zip_safe=False, extras_require=extras, entry_points={"console_scripts": ["transformers-cli=transformers.commands.transformers_cli:main"]}, - python_requires=">=3.8.0", + python_requires=">=3.9.0", install_requires=list(install_requires), classifiers=[ "Development Status :: 5 - Production/Stable", @@ -461,7 +461,6 @@ def run(self): "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Topic :: Scientific/Engineering :: Artificial Intelligence", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 5ce23f4b764..a633f54a4af 100644 --- a/src/transformers/dependency_versions_table.py +++ 
b/src/transformers/dependency_versions_table.py @@ -56,7 +56,7 @@ "pytest": "pytest>=7.2.0,<8.0.0", "pytest-timeout": "pytest-timeout", "pytest-xdist": "pytest-xdist", - "python": "python>=3.8.0", + "python": "python>=3.9.0", "ray[tune]": "ray[tune]>=2.7.0", "regex": "regex!=2019.12.17", "requests": "requests", From 9643069465ff63191da97ddc459813d129308818 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 24 Oct 2024 11:23:29 +0200 Subject: [PATCH 09/99] v4.47.0.dev0 --- examples/flax/question-answering/run_qa.py | 2 +- .../speech-recognition/run_flax_speech_recognition_seq2seq.py | 2 +- examples/flax/text-classification/run_flax_glue.py | 2 +- examples/flax/token-classification/run_flax_ner.py | 2 +- .../pytorch/audio-classification/run_audio_classification.py | 2 +- examples/pytorch/contrastive-image-text/run_clip.py | 2 +- .../pytorch/image-classification/run_image_classification.py | 2 +- .../image-classification/run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- examples/pytorch/image-pretraining/run_mim_no_trainer.py | 2 +- .../pytorch/instance-segmentation/run_instance_segmentation.py | 2 +- .../run_instance_segmentation_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- examples/pytorch/language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_fim.py | 2 +- examples/pytorch/language-modeling/run_fim_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 2 +- examples/pytorch/object-detection/run_object_detection.py | 2 +- .../pytorch/object-detection/run_object_detection_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- 
examples/pytorch/question-answering/run_qa_beam_search.py | 2 +- .../pytorch/question-answering/run_qa_beam_search_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa_no_trainer.py | 2 +- examples/pytorch/question-answering/run_seq2seq_qa.py | 2 +- .../pytorch/semantic-segmentation/run_semantic_segmentation.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../pytorch/speech-recognition/run_speech_recognition_ctc.py | 2 +- .../speech-recognition/run_speech_recognition_ctc_adapter.py | 2 +- .../speech-recognition/run_speech_recognition_seq2seq.py | 2 +- examples/pytorch/summarization/run_summarization.py | 2 +- examples/pytorch/summarization/run_summarization_no_trainer.py | 2 +- examples/pytorch/text-classification/run_classification.py | 2 +- examples/pytorch/text-classification/run_glue.py | 2 +- examples/pytorch/text-classification/run_glue_no_trainer.py | 2 +- examples/pytorch/text-classification/run_xnli.py | 2 +- examples/pytorch/token-classification/run_ner.py | 2 +- examples/pytorch/token-classification/run_ner_no_trainer.py | 2 +- examples/pytorch/translation/run_translation.py | 2 +- examples/pytorch/translation/run_translation_no_trainer.py | 2 +- examples/tensorflow/contrastive-image-text/run_clip.py | 2 +- .../tensorflow/image-classification/run_image_classification.py | 2 +- examples/tensorflow/multiple-choice/run_swag.py | 2 +- examples/tensorflow/question-answering/run_qa.py | 2 +- examples/tensorflow/summarization/run_summarization.py | 2 +- examples/tensorflow/text-classification/run_glue.py | 2 +- examples/tensorflow/translation/run_translation.py | 2 +- setup.py | 2 +- src/transformers/__init__.py | 2 +- 53 files changed, 53 insertions(+), 53 deletions(-) diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index f72aa0df1ff..25a8706d869 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = 
logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index 361ab4aa54f..c0085c9f4bb 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risk. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index 092db16c987..9ffbb82cd3a 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index dd801456da2..9ffaade2056 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 009a1f63724..cfbc4d83d93 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index cab047ae0cb..3bed494b75c 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 0a9789426c2..aa1cd089ef5 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 0866cb0f832..2c60b359bd1 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 46863cbbf1c..90b30c60e78 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 3912c693440..773038f445c 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 5db5f55730f..5f38481db23 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index aeb78f95d28..368296709f6 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index 75b74d17d9d..d8bb9d6f235 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. 
Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 656571eb37e..d3f8ad8da97 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index e40a7bb265b..15538b2ef2e 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index 154fc151838..9d0e0008839 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -58,7 +58,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index b06aad86629..0af6d61107d 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index d021318ae06..4b615fdc4cf 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 8961ee93d31..13a1f7a0d86 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 0a207b80479..1c2b7ecf990 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index ac5db5f6b02..ea6c4a0e317 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 49436fefd1d..2f672390795 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 0aea1a11c14..91118744386 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index 23420205a9f..f312d0ce8a1 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index bb0a6455926..3159a79c7e5 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index b3d9ee1e9c7..2fc71e0666b 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 68dbdf0d6c1..3b7d607933c 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index d8dfb3ec350..a8f1fc10b9c 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 7cf50cf94a0..b0bcb940e51 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 4c119dcbb4a..46f2fa45a24 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 2787a228134..a0ce4d0f75c 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index ff5da5ed49a..78c798fd471 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 66a75ca5d09..4d9bb778042 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 8740ec5f88f..aa03dacd981 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 9a25d944053..9c4c2ac13d4 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index ad6abc7df3e..3d38e35aac5 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index a440a48110a..e7a186836fb 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 4284fdf12f8..90acf81a36a 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 3f18d974a96..7fcdf81fa86 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 6578e96dc9c..b058b6f74fd 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index d2a4c3dabfd..c8cb098e344 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 597da1d9d66..0646af80bdc 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index 4e164010185..ea37b9c51e6 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 4f896dff21c..ba1f15dd83e 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 68728003fc3..20a01a46f21 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index e87d6b2cacc..78655e7d6bc 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 389d633854e..cbd4400580d 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index 480330122b6..f9c6de0e42b 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -62,7 +62,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 428c3459903..92ebd0e1d77 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index c051e27e504..a51939d8d58 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index bc37685b66d..50189345d56 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.46.0.dev0") +check_min_version("4.47.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index f4028d13c44..cbfcfd43428 100644 --- a/setup.py +++ b/setup.py @@ -435,7 +435,7 @@ def run(self): setup( name="transformers", - version="4.46.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.47.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 771e3e8f0ae..cc8b0739502 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.46.0.dev0" +__version__ = "4.47.0.dev0" from typing import TYPE_CHECKING From f0b3ef9e2e6a76bd22091502899091b47ce7e930 Mon Sep 17 00:00:00 2001 From: blueingman <15329507600@163.com> Date: Thu, 24 Oct 2024 17:47:58 +0800 Subject: [PATCH 10/99] translated gguf.md into chinese (#34163) * translated gguf.md into chinese * Apply suggestions from code review I have updated the PR accordingly.Thank you very much for detailed guidance,and I 'll pay more attention to the details next time. 
Co-authored-by: Isotr0py <2037008807@qq.com> * Apply suggestions from code review Co-authored-by: Isotr0py <2037008807@qq.com> --------- Co-authored-by: Isotr0py <2037008807@qq.com> --- docs/source/zh/_toctree.yml | 2 + docs/source/zh/gguf.md | 104 ++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 docs/source/zh/gguf.md diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml index fe966bdbfcf..07c97e51550 100644 --- a/docs/source/zh/_toctree.yml +++ b/docs/source/zh/_toctree.yml @@ -50,6 +50,8 @@ title: 导出为 TFLite - local: torchscript title: 导出为 TorchScript + - local: gguf + title: 与 GGUF 格式的互操作性 title: 开发者指南 - sections: - local: performance diff --git a/docs/source/zh/gguf.md b/docs/source/zh/gguf.md new file mode 100644 index 00000000000..3da64a5d995 --- /dev/null +++ b/docs/source/zh/gguf.md @@ -0,0 +1,104 @@ + + +# GGUF 和 Transformers 的交互 + +GGUF文件格式用于存储模型,以便通过[GGML](https://github.com/ggerganov/ggml)和其他依赖它的库进行推理,例如非常流行的[llama.cpp](https://github.com/ggerganov/llama.cpp)或[whisper.cpp](https://github.com/ggerganov/whisper.cpp)。 + +该文件格式[由抱抱脸支持](https://huggingface.co/docs/hub/en/gguf),可用于快速检查文件中张量和元数据。 + +该文件格式是一种“单文件格式”,通常单个文件就包含了配置属性、分词器词汇表和其他属性,同时还有模型中要加载的所有张量。这些文件根据文件的量化类型有不同的格式。我们在[这里](https://huggingface.co/docs/hub/en/gguf#quantization-types)进行了简要介绍。 + +## 在 Transformers 中的支持 + +我们在 transformers 中添加了加载 gguf 文件的功能,这样可以对 GGUF 模型进行进一步的训练或微调,然后再将模型转换回 GGUF 格式,以便在 ggml 生态系统中使用。加载模型时,我们首先将其反量化为 FP32,然后再加载权重以在 PyTorch 中使用。 + +> [!注意] +> 目前这个功能还处于探索阶段,欢迎大家贡献力量,以便在不同量化类型和模型架构之间更好地完善这一功能。 + +目前,支持的模型架构和量化类型如下: + +### 支持的量化类型 + +根据分享在 Hub 上的较为热门的量化文件,初步支持以下量化类型: + +- F32 +- F16 +- BF16 +- Q4_0 +- Q4_1 +- Q5_0 +- Q5_1 +- Q8_0 +- Q2_K +- Q3_K +- Q4_K +- Q5_K +- Q6_K +- IQ1_S +- IQ1_M +- IQ2_XXS +- IQ2_XS +- IQ2_S +- IQ3_XXS +- IQ3_S +- IQ4_XS +- IQ4_NL + +> [!注意] +> 为了支持 gguf 反量化,需要安装 `gguf>=0.10.0`。 + +### 支持的模型架构 + +目前支持以下在 Hub 上非常热门的模型架构: + +- LLaMa +- Mistral +- Qwen2 +- Qwen2Moe +- Phi3 +- Bloom +- Falcon +- 
StableLM +- GPT2 +- Starcoder2 + +## 使用示例 + +为了在`transformers`中加载`gguf`文件,你需要在 `from_pretrained`方法中为分词器和模型指定 `gguf_file`参数。下面是从同一个文件中加载分词器和模型的示例: + +```py +from transformers import AutoTokenizer, AutoModelForCausalLM + +model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" +filename = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf" + +tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename) +model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename) +``` + +现在,你就已经可以结合 PyTorch 生态系统中的一系列其他工具,来使用完整的、未量化的模型了。 + +为了将模型转换回`gguf`文件,我们建议使用`llama.cpp`中的[`convert_hf_to_gguf.py`文件](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py)。 + +以下是如何补充上面的脚本,以保存模型并将其导出回 `gguf`的示例: + +```py +tokenizer.save_pretrained('directory') +model.save_pretrained('directory') + +!python ${path_to_llama_cpp}/convert_hf_to_gguf.py ${directory} +``` \ No newline at end of file From b29c24ff1ed130d717c59b58091cfedb652872d0 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Thu, 24 Oct 2024 13:44:53 +0200 Subject: [PATCH 11/99] CI: fix failures (#34371) fix --- src/transformers/models/video_llava/modeling_video_llava.py | 1 + tests/models/instructblip/test_modeling_instructblip.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index c4ec1b51969..0fe89676b92 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -561,6 +561,7 @@ def forward( ) video_features = None + num_frames = 0 if pixel_values_videos is not None: video_features, num_frames = self.get_video_features( pixel_values_videos=pixel_values_videos, vision_feature_layer=vision_feature_layer diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 5182ac20cd9..a33be021353 100644 ---
a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -621,7 +621,7 @@ def test_inference_vicuna_7b(self): logits = model(**inputs).logits expected_slice = torch.tensor( - [[-3.3926, -12.2969, 8.4922], [-5.0195, -11.9531, 8.1406], [-4.0039, -13.3594, 9.2578]], + [[-3.3047, -12.0625, 8.4922], [-4.9258, -11.7578, 8.1406], [-3.9297, -13.5000, 9.2500]], device=torch_device, ) From 2112027d0cb8ae83ea9343176d77cb8a642c4556 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Thu, 24 Oct 2024 14:29:33 +0200 Subject: [PATCH 12/99] Zamba is an LM (#34342) * Zamba is an LM * Addition --- docs/source/en/_toctree.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index aa975fc9d9f..a7806059afa 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -606,6 +606,8 @@ title: XLNet - local: model_doc/yoso title: YOSO + - local: model_doc/zamba + title: Zamba title: Text models - isExpanded: false sections: @@ -715,8 +717,6 @@ title: ViTMSN - local: model_doc/yolos title: YOLOS - - local: model_doc/zamba - title: Zamba - local: model_doc/zoedepth title: ZoeDepth title: Vision models @@ -973,4 +973,4 @@ - local: internal/time_series_utils title: Utilities for Time Series title: Internal Helpers - title: API \ No newline at end of file + title: API From 30c76d5b2836b9ba6b0e417aafb5ba77b4129ffe Mon Sep 17 00:00:00 2001 From: Thomas Furtner Date: Thu, 24 Oct 2024 14:42:47 +0200 Subject: [PATCH 13/99] add code generation to natural language processing section (#34333) --- docs/source/en/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ce0ffc7db05..aaff45ab65d 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -19,7 +19,7 @@ State-of-the-art Machine Learning for [PyTorch](https://pytorch.org/), [TensorFl 🤗 Transformers 
provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as: -📝 **Natural Language Processing**: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation.
+📝 **Natural Language Processing**: text classification, named entity recognition, question answering, language modeling, code generation, summarization, translation, multiple choice, and text generation.
🖼️ **Computer Vision**: image classification, object detection, and segmentation.
🗣️ **Audio**: automatic speech recognition and audio classification.
🐙 **Multimodal**: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering. From dd267fca729621cec18b6199b31671ed9513a82c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=A4=80=EC=9E=AC?= <55151385+junejae@users.noreply.github.com> Date: Thu, 24 Oct 2024 22:10:59 +0900 Subject: [PATCH 14/99] Add T5 GGUF loading support (#33389) * add: GGUFT5Converter * add: tensormapping for t5 * add: test code for t5 * fix: Remove whitespace from blank line * add: t5 fp16 tests * fix: whitespace formatting * fix: minor formatting * fix: testing every weights --- docs/source/en/gguf.md | 1 + src/transformers/integrations/ggml.py | 128 +++++++++++++++++- .../modeling_gguf_pytorch_utils.py | 17 ++- .../models/t5/tokenization_t5_fast.py | 2 +- tests/quantization/ggml/test_ggml.py | 56 +++++++- 5 files changed, 197 insertions(+), 7 deletions(-) diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md index 01583cedbf4..20531b990bc 100644 --- a/docs/source/en/gguf.md +++ b/docs/source/en/gguf.md @@ -85,6 +85,7 @@ For now the supported model architectures are the architectures that have been v - StableLM - GPT2 - Starcoder2 +- T5 ## Example usage diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 7b5828176ff..4a2740fcb30 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -21,11 +21,11 @@ from array import array import numpy as np -from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers -from tokenizers.models import BPE +from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors +from tokenizers.models import BPE, Unigram from .. 
import AddedToken -from ..convert_slow_tokenizer import GPT2Converter, LlamaConverter, Qwen2Converter +from ..convert_slow_tokenizer import GPT2Converter, LlamaConverter, Qwen2Converter, T5Converter from ..utils import logging from ..utils.logging import tqdm @@ -148,6 +148,51 @@ ".output.": ".lm_head.", "output_norm": "ln_f", }, + "t5": { + "token_embd": "shared", + "dec.blk.{bid}.attn_q": "decoder.block.{bid}.layer.0.SelfAttention.q", + "dec.blk.{bid}.attn_k": "decoder.block.{bid}.layer.0.SelfAttention.k", + "dec.blk.{bid}.attn_v": "decoder.block.{bid}.layer.0.SelfAttention.v", + "dec.blk.{bid}.attn_o": "decoder.block.{bid}.layer.0.SelfAttention.o", + "dec.blk.{bid}.attn_rel_b": "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", + "dec.blk.{bid}.attn_norm": "decoder.block.{bid}.layer.0.layer_norm", + "dec.blk.{bid}.cross_attn_q": "decoder.block.{bid}.layer.1.EncDecAttention.q", + "dec.blk.{bid}.cross_attn_k": "decoder.block.{bid}.layer.1.EncDecAttention.k", + "dec.blk.{bid}.cross_attn_v": "decoder.block.{bid}.layer.1.EncDecAttention.v", + "dec.blk.{bid}.cross_attn_o": "decoder.block.{bid}.layer.1.EncDecAttention.o", + "dec.blk.{bid}.cross_attn_norm": "decoder.block.{bid}.layer.1.layer_norm", + "dec.blk.{bid}.ffn_gate": "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", + "dec.blk.{bid}.ffn_up": "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", + "dec.blk.{bid}.ffn_down": "decoder.block.{bid}.layer.2.DenseReluDense.wo", + "dec.blk.{bid}.ffn_norm": "decoder.block.{bid}.layer.2.layer_norm", + "dec.output_norm": "decoder.final_layer_norm", + "enc.blk.{bid}.attn_q": "encoder.block.{bid}.layer.0.SelfAttention.q", + "enc.blk.{bid}.attn_k": "encoder.block.{bid}.layer.0.SelfAttention.k", + "enc.blk.{bid}.attn_v": "encoder.block.{bid}.layer.0.SelfAttention.v", + "enc.blk.{bid}.attn_o": "encoder.block.{bid}.layer.0.SelfAttention.o", + "enc.blk.{bid}.attn_rel_b": "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", + 
"enc.blk.{bid}.attn_norm": "encoder.block.{bid}.layer.0.layer_norm", + "enc.blk.{bid}.ffn_gate": "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", + "enc.blk.{bid}.ffn_up": "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", + "enc.blk.{bid}.ffn_down": "encoder.block.{bid}.layer.1.DenseReluDense.wo", + "enc.blk.{bid}.ffn_norm": "encoder.block.{bid}.layer.1.layer_norm", + "enc.output_norm": "encoder.final_layer_norm", + "output.weight": "lm_head.weight", + }, + "t5encoder": { + "token_embd": "shared", + "enc.blk.{bid}.attn_q": "encoder.block.{bid}.layer.0.SelfAttention.q", + "enc.blk.{bid}.attn_k": "encoder.block.{bid}.layer.0.SelfAttention.k", + "enc.blk.{bid}.attn_v": "encoder.block.{bid}.layer.0.SelfAttention.v", + "enc.blk.{bid}.attn_o": "encoder.block.{bid}.layer.0.SelfAttention.o", + "enc.blk.{bid}.attn_rel_b": "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", + "enc.blk.{bid}.attn_norm": "encoder.block.{bid}.layer.0.layer_norm", + "enc.blk.{bid}.ffn_gate": "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", + "enc.blk.{bid}.ffn_up": "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", + "enc.blk.{bid}.ffn_down": "encoder.block.{bid}.layer.1.DenseReluDense.wo", + "enc.blk.{bid}.ffn_norm": "encoder.block.{bid}.layer.1.layer_norm", + "enc.output_norm": "encoder.final_layer_norm", + }, "stablelm": { "token_embd": "model.embed_tokens", "blk": "model.layers", @@ -287,6 +332,19 @@ "vocab_size": "vocab_size", "attention.layer_norm_epsilon": "layer_norm_epsilon", }, + "t5": { + "context_length": "n_positions", + "block_count": "num_layers", + "feed_forward_length": "d_ff", + "embedding_length": "d_model", + "attention.key_length": "d_kv", + "attention.head_count": "num_heads", + "attention.head_count_kv": "num_key_value_heads", + "attention.layer_norm_epsilon": "layer_norm_epsilon", + "attention.relative_buckets_count": "relative_attention_num_buckets", + "decoder_start_token_id": "decoder_start_token_id", + "vocab_size": "vocab_size", + }, "stablelm": 
{ "context_length": "max_position_embeddings", "block_count": "num_hidden_layers", @@ -636,6 +694,69 @@ def converted(self) -> Tokenizer: return tokenizer +class GGUFT5Converter(T5Converter): + def __init__(self, tokenizer_dict): + # set dummy data to avoid unnecessary merges calculation + tokenizer_dict["merges"] = ["dummy text"] + + self.proto = GGUFTokenizerSkeleton(tokenizer_dict) + self.token2id = {k: v for v, k in enumerate(self.proto.tokens)} + self.original_tokenizer = self.proto + self.additional_kwargs = {} + + def vocab(self, proto): + return list(zip(proto.tokens, proto.scores)) + + def normalizer(self, proto): + if getattr(self.original_tokenizer, "legacy", True): + sequence = [] + if getattr(self.original_tokenizer, "add_prefix_space", True): + sequence += [normalizers.Prepend(prepend="▁")] + sequence += [normalizers.Replace(pattern=" ", content="▁")] + return normalizers.Sequence(sequence) + return None # non-legacy, no normalizer + + def post_processor(self): + return processors.TemplateProcessing( + single=["$A", ""], + pair=["$A", "", "$B", ""], + special_tokens=[ + ("", self.token2id[""]), + ], + ) + + def converted(self) -> Tokenizer: + vocab_scores = self.vocab(self.proto) + tokenizer = Tokenizer( + Unigram( + vocab_scores, + unk_id=self.proto.unk_token_id, + byte_fallback=False, + ) + ) + + # Tokenizer assemble + normalizer = self.normalizer(self.proto) + if normalizer is not None: + tokenizer.normalizer = normalizer + + replacement = "▁" + add_prefix_space = True + if hasattr(self.original_tokenizer, "add_prefix_space"): + add_prefix_space = self.original_tokenizer.add_prefix_space + + pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space) + if pre_tokenizer is not None: + tokenizer.pre_tokenizer = pre_tokenizer + + tokenizer.decoder = self.decoder(replacement, add_prefix_space) + post_processor = self.post_processor() + if post_processor: + tokenizer.post_processor = post_processor + + return tokenizer + + GGUF_TO_FAST_CONVERTERS 
= { "llama": GGUFLlamaConverter, "qwen2": GGUFQwen2Converter, @@ -646,6 +767,7 @@ def converted(self) -> Tokenizer: "stablelm": GGUFGPTConverter, "gpt2": GGUFGPTConverter, "starcoder2": GGUFGPTConverter, + "t5": GGUFT5Converter, } diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py index b1d7b896085..171b2f4d15b 100644 --- a/src/transformers/modeling_gguf_pytorch_utils.py +++ b/src/transformers/modeling_gguf_pytorch_utils.py @@ -94,6 +94,12 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): # to add this patch to ensure things work correctly on our side. if "llama" in architecture and "mistral" in model_name: updated_architecture = "mistral" + # FIXME: Currnetly this implementation is only for flan-t5 architecture. + # It needs to be developed for supporting legacy t5. + elif "t5" in architecture or "t5encoder" in architecture: + parsed_parameters["config"]["tie_word_embeddings"] = False + parsed_parameters["config"]["is_gated_act"] = True + updated_architecture = "t5" else: updated_architecture = architecture @@ -191,6 +197,13 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): else: weights = reverse_reshape_bias(weights, num_heads, n_embed) + bid = None + if architecture in ("t5", "t5encoder"): + for chunk in name.split("."): + if chunk.isdigit(): + bid = int(chunk) + break + if architecture == "gpt2": if ( "attn_qkv.weight" in name @@ -209,8 +222,8 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): continue for tensor_name in tensor_key_mapping: - if tensor_name in name: - name = name.replace(tensor_name, tensor_key_mapping[tensor_name]) + if tensor_name.format(bid=bid) in name: + name = name.replace(tensor_name.format(bid=bid), tensor_key_mapping[tensor_name].format(bid=bid)) # Use copy to avoid errors with numpy and pytorch parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights)) diff --git 
a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py index 0a92803f165..4c3fa950559 100644 --- a/src/transformers/models/t5/tokenization_t5_fast.py +++ b/src/transformers/models/t5/tokenization_t5_fast.py @@ -117,7 +117,7 @@ def __init__( kwargs["from_slow"] = True super().__init__( - vocab_file, + vocab_file=vocab_file, tokenizer_file=tokenizer_file, eos_token=eos_token, unk_token=unk_token, diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index 6e47d46f07c..ddc791e96a6 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -15,7 +15,7 @@ import tempfile import unittest -from transformers import AddedToken, AutoModelForCausalLM, AutoTokenizer +from transformers import AddedToken, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer from transformers.testing_utils import ( require_gguf, require_torch_gpu, @@ -48,6 +48,8 @@ class GgufIntegrationTests(unittest.TestCase): falcon7b_model_id = "xaviviro/falcon-7b-quantized-gguf" falcon40b_model_id = "maddes8cht/tiiuae-falcon-40b-gguf" original_flacon7b_model_id = "tiiuae/falcon-7b" + t5_model_id = "repetitio/flan-t5-small" + original_t5_model_id = "google/flan-t5-small" stablelm_model_id = "afrideva/stablelm-3b-4e1t-GGUF" stablelm2_model_id = "afrideva/stablelm-2-1_6b-GGUF" original_stablelm2_model_id = "stabilityai/stablelm-2-1_6b" @@ -92,6 +94,8 @@ class GgufIntegrationTests(unittest.TestCase): q2_k_falcon7b_model_id = "falcon-7b-q2_k.gguf" fp16_falcon7b_model_id = "falcon-7b-fp16.gguf" q2_k_falcon40b_model_id = "tiiuae-falcon-40b-Q2_K.gguf" + fp16_t5_model_id = "flan-t5-small-f16.gguf" + q8_0_t5_model_id = "flan-t5-small-q8_0.gguf" fp16_qwen2moe_model_id = "Qwen1.5-MoE-A2.7B.gguf" fp16_gpt2_model_id = "gpt2.f16.gguf" q8_gpt2_model_id = "gpt2.Q8_0.gguf" @@ -487,6 +491,56 @@ def test_bloom_weights_conversion_fp16(self): self.assertTrue(quantized_param.shape == original_param.shape) 
torch.testing.assert_close(quantized_param, original_param) + def test_t5_f16(self): + tokenizer = AutoTokenizer.from_pretrained(self.t5_model_id, gguf_file=self.fp16_t5_model_id) + model = AutoModelForSeq2SeqLM.from_pretrained( + self.t5_model_id, gguf_file=self.fp16_t5_model_id, device_map="auto", torch_dtype=torch.float16 + ) + + T5_EXAMPLE_TEXT = "translate English to German: How old are you?" + + text = tokenizer(T5_EXAMPLE_TEXT, return_tensors="pt").to(torch_device) + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Wie ich er?" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + + def test_t5_q8_0(self): + tokenizer = AutoTokenizer.from_pretrained(self.t5_model_id, gguf_file=self.q8_0_t5_model_id) + model = AutoModelForSeq2SeqLM.from_pretrained( + self.t5_model_id, gguf_file=self.q8_0_t5_model_id, device_map="auto", torch_dtype=torch.float16 + ) + + T5_EXAMPLE_TEXT = "translate English to German: How old are you?" + + text = tokenizer(T5_EXAMPLE_TEXT, return_tensors="pt").to(torch_device) + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Wie ich er?" 
+ self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + + def test_t5_weights_conversion_fp16(self): + quantized_model = AutoModelForSeq2SeqLM.from_pretrained( + self.t5_model_id, + gguf_file=self.fp16_t5_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + original_model = AutoModelForSeq2SeqLM.from_pretrained( + self.original_t5_model_id, + device_map="auto", + torch_dtype=torch.float16, + ) + + quantized_state_dict = quantized_model.state_dict() + original_state_dict = original_model.state_dict() + + for (quantized_name, quantized_param), (original_name, original_param) in zip( + quantized_state_dict.items(), original_state_dict.items() + ): + self.assertTrue(quantized_param.shape == original_param.shape) + torch.testing.assert_close(quantized_param, original_param, rtol=5e-04, atol=5e-04) + def test_gpt2_q8(self): tokenizer = AutoTokenizer.from_pretrained(self.gpt2_model_id, gguf_file=self.q8_gpt2_model_id) model = AutoModelForCausalLM.from_pretrained( From 6432ad8bb5dec9c7ece1041767c9e208ff6b4cbb Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Thu, 24 Oct 2024 09:22:50 -0400 Subject: [PATCH 15/99] Fix pil_torch_interpolation_mapping import in image_processing_detr_fast (#34375) fix pil_torch_interpolation_mapping import --- src/transformers/models/detr/image_processing_detr_fast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index 97940ab3132..0fa1d0ffd9d 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -40,7 +40,6 @@ get_image_type, infer_channel_dimension_format, make_list_of_images, - pil_torch_interpolation_mapping, validate_annotations, validate_kwargs, ) @@ -72,7 +71,8 @@ if is_torchvision_available(): from torchvision.io import 
read_image - from ...image_utils import pil_torch_interpolation_mapping + if is_vision_available(): + from ...image_utils import pil_torch_interpolation_mapping if is_torchvision_v2_available(): from torchvision.transforms.v2 import functional as F From 450b9cbfacc5b5aaf18ecc25217ab80b6fc8cf99 Mon Sep 17 00:00:00 2001 From: Vijay Date: Thu, 24 Oct 2024 20:58:51 +0530 Subject: [PATCH 16/99] Add code sample docstrings and checkpoint reference for GLM models (#34360) * Add code sample docstrings and checkpoint reference for GLM models * Update modular_glm.py * Update modeling_glm.py --- src/transformers/models/glm/modeling_glm.py | 3 +++ src/transformers/models/glm/modular_glm.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 6354e20e33f..1e7c6eae0ee 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -59,6 +59,9 @@ _CHECKPOINT_FOR_DOC = "dummy" +_CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b" + + class GlmRMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index c26477fdc17..9cfd617eeb2 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -44,6 +44,8 @@ from .configuration_glm import GlmConfig +_CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b" + logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "dummy" From e2886166065db25029afb58c699d6272baf22965 Mon Sep 17 00:00:00 2001 From: "Winston H." 
<56998716+winstxnhdw@users.noreply.github.com> Date: Thu, 24 Oct 2024 16:40:26 +0100 Subject: [PATCH 17/99] refactor: remove redundant if-condition and improve type correctness for `convert_tokens_to_ids` (#34030) * chore: remove redundant if-condition * fix: import `Iterable` --- src/transformers/tokenization_utils_fast.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index cec91e038dd..fabc1a1d5ed 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -21,7 +21,7 @@ import json import os from collections import defaultdict -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import tokenizers.pre_tokenizers as pre_tokenizers_fast from tokenizers import Encoding as EncodingFast @@ -326,20 +326,17 @@ def _convert_encoding( return encoding_dict, encodings - def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: + def convert_tokens_to_ids(self, tokens: Union[str, Iterable[str]]) -> Union[int, List[int]]: """ - Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the + Converts a token string (or a sequence of tokens) in a single integer id (or a Iterable of ids), using the vocabulary. Args: - tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s). + tokens (`str` or `Iterable[str]`): One or several token(s) to convert to token id(s). Returns: `int` or `List[int]`: The token id or list of token ids. 
""" - if tokens is None: - return None - if isinstance(tokens, str): return self._convert_token_to_id_with_added_voc(tokens) From fe3507331998e7154a206055b34e3ba338290d3d Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:46:39 -0400 Subject: [PATCH 18/99] Ignore unsupported kwarg in ProcessorMixin call (#34285) Fix accept any common kwargs --- src/transformers/processing_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index cb2327e5c46..b363f8c72cc 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -874,7 +874,11 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg else: # kwargs is a flat dictionary for key in kwargs: - if key not in used_keys: + if key not in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__.keys(): + logger.warning_once( + f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." + ) + elif key not in used_keys: output_kwargs["common_kwargs"][key] = kwargs[key] # all modality-specific kwargs are updated with common kwargs From d9989e0b9a5633db923f12e61cb8b6e72cf71a7c Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 24 Oct 2024 17:56:40 +0200 Subject: [PATCH 19/99] [PEFT] Add warning for missing key in LoRA adapter (#34068) When loading a LoRA adapter, so far, there was only a warning when there were unexpected keys in the checkpoint. Now, there is also a warning when there are missing keys. This change is consistent with https://github.com/huggingface/peft/pull/2118 in PEFT and the planned PR https://github.com/huggingface/diffusers/pull/9622 in diffusers. Apart from this change, the error message for unexpected keys was slightly altered for consistency (it should be more readable now). 
Also, besides adding a test for the missing keys warning, a test for unexpected keys warning was also added, as it was missing so far. --- src/transformers/integrations/peft.py | 24 +++++- .../peft_integration/test_peft_integration.py | 78 ++++++++++++++++++- 2 files changed, 96 insertions(+), 6 deletions(-) diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index bd0ca16f865..8afff36eb08 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -235,13 +235,29 @@ def load_adapter( ) if incompatible_keys is not None: - # check only for unexpected keys + err_msg = "" + origin_name = peft_model_id if peft_model_id is not None else "state_dict" + # Check for unexpected keys. if hasattr(incompatible_keys, "unexpected_keys") and len(incompatible_keys.unexpected_keys) > 0: - logger.warning( - f"Loading adapter weights from {peft_model_id} led to unexpected keys not found in the model: " - f" {incompatible_keys.unexpected_keys}. " + err_msg = ( + f"Loading adapter weights from {origin_name} led to unexpected keys not found in the model: " + f"{', '.join(incompatible_keys.unexpected_keys)}. " ) + # Check for missing keys. + missing_keys = getattr(incompatible_keys, "missing_keys", None) + if missing_keys: + # Filter missing keys specific to the current adapter, as missing base model keys are expected. + lora_missing_keys = [k for k in missing_keys if "lora_" in k and adapter_name in k] + if lora_missing_keys: + err_msg += ( + f"Loading adapter weights from {origin_name} led to missing keys in the model: " + f"{', '.join(lora_missing_keys)}" + ) + + if err_msg: + logger.warning(err_msg) + # Re-dispatch model and hooks in case the model is offloaded to CPU / Disk. 
if ( (getattr(self, "hf_device_map", None) is not None) diff --git a/tests/peft_integration/test_peft_integration.py b/tests/peft_integration/test_peft_integration.py index a80919dc61c..aebf2b29526 100644 --- a/tests/peft_integration/test_peft_integration.py +++ b/tests/peft_integration/test_peft_integration.py @@ -20,8 +20,9 @@ from huggingface_hub import hf_hub_download from packaging import version -from transformers import AutoModelForCausalLM, OPTForCausalLM +from transformers import AutoModelForCausalLM, OPTForCausalLM, logging from transformers.testing_utils import ( + CaptureLogger, require_bitsandbytes, require_peft, require_torch, @@ -72,9 +73,15 @@ def test_peft_from_pretrained(self): This checks if we pass a remote folder that contains an adapter config and adapter weights, it should correctly load a model that has adapters injected on it. """ + logger = logging.get_logger("transformers.integrations.peft") + for model_id in self.peft_test_model_ids: for transformers_class in self.transformers_test_model_classes: - peft_model = transformers_class.from_pretrained(model_id).to(torch_device) + with CaptureLogger(logger) as cl: + peft_model = transformers_class.from_pretrained(model_id).to(torch_device) + # ensure that under normal circumstances, there are no warnings about keys + self.assertNotIn("unexpected keys", cl.out) + self.assertNotIn("missing keys", cl.out) self.assertTrue(self._check_lora_correctly_converted(peft_model)) self.assertTrue(peft_model._hf_peft_config_loaded) @@ -548,3 +555,70 @@ def test_peft_from_pretrained_hub_kwargs(self): model = OPTForCausalLM.from_pretrained(peft_model_id, adapter_kwargs=adapter_kwargs) self.assertTrue(self._check_lora_correctly_converted(model)) + + def test_peft_from_pretrained_unexpected_keys_warning(self): + """ + Test for warning when loading a PEFT checkpoint with unexpected keys. 
+ """ + from peft import LoraConfig + + logger = logging.get_logger("transformers.integrations.peft") + + for model_id, peft_model_id in zip(self.transformers_test_model_ids, self.peft_test_model_ids): + for transformers_class in self.transformers_test_model_classes: + model = transformers_class.from_pretrained(model_id).to(torch_device) + + peft_config = LoraConfig() + state_dict_path = hf_hub_download(peft_model_id, "adapter_model.bin") + dummy_state_dict = torch.load(state_dict_path) + + # add unexpected key + dummy_state_dict["foobar"] = next(iter(dummy_state_dict.values())) + + with CaptureLogger(logger) as cl: + model.load_adapter( + adapter_state_dict=dummy_state_dict, peft_config=peft_config, low_cpu_mem_usage=False + ) + + msg = "Loading adapter weights from state_dict led to unexpected keys not found in the model: foobar" + self.assertIn(msg, cl.out) + + def test_peft_from_pretrained_missing_keys_warning(self): + """ + Test for warning when loading a PEFT checkpoint with missing keys. + """ + from peft import LoraConfig + + logger = logging.get_logger("transformers.integrations.peft") + + for model_id, peft_model_id in zip(self.transformers_test_model_ids, self.peft_test_model_ids): + for transformers_class in self.transformers_test_model_classes: + model = transformers_class.from_pretrained(model_id).to(torch_device) + + peft_config = LoraConfig() + state_dict_path = hf_hub_download(peft_model_id, "adapter_model.bin") + dummy_state_dict = torch.load(state_dict_path) + + # remove a key so that we have missing keys + key = next(iter(dummy_state_dict.keys())) + del dummy_state_dict[key] + + with CaptureLogger(logger) as cl: + model.load_adapter( + adapter_state_dict=dummy_state_dict, + peft_config=peft_config, + low_cpu_mem_usage=False, + adapter_name="other", + ) + + # Here we need to adjust the key name a bit to account for PEFT-specific naming. + # 1. 
Remove PEFT-specific prefix + # If merged after dropping Python 3.8, we can use: key = key.removeprefix(peft_prefix) + peft_prefix = "base_model.model." + key = key[len(peft_prefix) :] + # 2. Insert adapter name + prefix, _, suffix = key.rpartition(".") + key = f"{prefix}.other.{suffix}" + + msg = f"Loading adapter weights from state_dict led to missing keys in the model: {key}" + self.assertIn(msg, cl.out) From 1c5918d9106ba530c700f08ad7847b09a8b68457 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Thu, 24 Oct 2024 18:34:28 +0200 Subject: [PATCH 20/99] Fix `torch.fx` issue related to the new `loss_kwargs` keyword argument (#34380) * Fix FX * Unskip tests --- src/transformers/utils/fx.py | 2 +- tests/models/cohere/test_modeling_cohere.py | 1 - tests/models/mistral/test_modeling_mistral.py | 1 - tests/models/mixtral/test_modeling_mixtral.py | 1 - tests/models/qwen2/test_modeling_qwen2.py | 1 - tests/models/qwen2_moe/test_modeling_qwen2_moe.py | 1 - 6 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index c78b4c34c33..3764f1ee4ce 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -1416,7 +1416,7 @@ def keys(self, obj: "Proxy") -> Any: your custom tracer. 
""" attribute = HFAttribute(obj, "keys")() - if obj.node.target == "**kwargs": + if obj.node.target.startswith("**"): return attribute._metadata return attribute diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py index b8a5aec9d41..3a05867dfdf 100644 --- a/tests/models/cohere/test_modeling_cohere.py +++ b/tests/models/cohere/test_modeling_cohere.py @@ -304,7 +304,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="PR #34283 made changes to the forward function.") def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index 13e5e3d1f60..600c4ffa14b 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -356,7 +356,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="PR #34283 made changes to the forward function.") def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 0bfb5126ebd..0688435e814 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -356,7 +356,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="PR #34283 made changes to the forward function.") def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 769d6caabd9..301937079ae 
100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -368,7 +368,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="PR #34283 made changes to the forward function.") def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index 374d9472ca2..30d7996d7e7 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -391,7 +391,6 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="PR #34283 made changes to the forward function.") def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() From 4c6e0c92527f54c51fc20c1781ab42aeb946f25e Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 24 Oct 2024 18:42:03 +0200 Subject: [PATCH 21/99] Correct the new defaults (#34377) * Correct the new defaults * CIs * add check * Update utils.py * Update utils.py * Add the max_length in generate test checking shape without passing length * style * CIs * fix fx CI issue --- src/transformers/generation/utils.py | 5 ++++- .../encoder_decoder/test_modeling_encoder_decoder.py | 4 +++- .../test_modeling_speech_encoder_decoder.py | 4 +++- .../test_modeling_vision_encoder_decoder.py | 7 ++++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 3938457155d..efe953db051 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1440,8 +1440,11 @@ def _prepare_generated_length( and not self.config.is_encoder_decoder ): generation_config.max_length -= 
inputs_tensor.shape[1] - else: # by default let's always generate 10 new tokens + elif has_default_max_length: # by default let's always generate 20 new tokens generation_config.max_length = generation_config.max_length + input_ids_length + max_position_embeddings = getattr(self.config, "max_position_embeddings", None) + if max_position_embeddings is not None: + generation_config.max_length = min(generation_config.max_length, max_position_embeddings) # same for min length if generation_config.min_new_tokens is not None: diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py index 0ee4b75ed80..64ebedcb459 100644 --- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py @@ -488,7 +488,9 @@ def check_encoder_decoder_model_generate(self, input_ids, config, decoder_config # Bert does not have a bos token id, so use pad_token_id instead generated_output = enc_dec_model.generate( - input_ids, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id + input_ids, + decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id, + max_length=decoder_config.max_length, ) self.assertEqual(generated_output.shape, (input_ids.shape[0],) + (decoder_config.max_length,)) diff --git a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py index 6e0b7fa9782..7dcb7c406ae 100644 --- a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py +++ b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py @@ -362,7 +362,9 @@ def check_encoder_decoder_model_generate( # Bert does not have a bos token id, so use pad_token_id instead generated_output = enc_dec_model.generate( - inputs, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id + inputs, + 
decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id, + max_length=decoder_config.max_length, ) self.assertEqual(generated_output.shape, (inputs.shape[0],) + (decoder_config.max_length,)) diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index 7def8a9ac96..77e2a19fea4 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -306,7 +306,9 @@ def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_val # Bert does not have a bos token id, so use pad_token_id instead generated_output = enc_dec_model.generate( - inputs, decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id + inputs, + decoder_start_token_id=enc_dec_model.config.decoder.pad_token_id, + max_length=decoder_config.max_length, ) self.assertEqual(generated_output.shape, (inputs.shape[0],) + (decoder_config.max_length,)) @@ -873,6 +875,7 @@ def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_val generated_output = enc_dec_model.generate( pixel_values=pixel_values, decoder_start_token_id=enc_dec_model.config.decoder.bos_token_id, + max_length=decoder_config.max_length, **kwargs, ) self.assertEqual(generated_output.shape, (pixel_values.shape[0],) + (decoder_config.max_length,)) @@ -990,6 +993,7 @@ def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_val generated_output = enc_dec_model.generate( pixel_values=pixel_values, decoder_start_token_id=enc_dec_model.config.decoder.bos_token_id, + max_length=decoder_config.max_length, **kwargs, ) self.assertEqual(generated_output.shape, (pixel_values.shape[0],) + (decoder_config.max_length,)) @@ -1107,6 +1111,7 @@ def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_val generated_output = enc_dec_model.generate( 
pixel_values=pixel_values, decoder_start_token_id=enc_dec_model.config.decoder.bos_token_id, + max_length=decoder_config.max_length, **kwargs, ) self.assertEqual(generated_output.shape, (pixel_values.shape[0],) + (decoder_config.max_length,)) From a308d28d397af77c6a6b6d3b397991b555677007 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 24 Oct 2024 19:07:23 +0200 Subject: [PATCH 22/99] [auto. ping] Avoid sending empty info + add more team members (#34383) * update * update --------- Co-authored-by: ydshieh --- .github/workflows/check_failed_model_tests.yml | 2 +- utils/check_bad_commit.py | 10 +++++++++- utils/process_bad_commit_report.py | 13 ++++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/.github/workflows/check_failed_model_tests.yml b/.github/workflows/check_failed_model_tests.yml index f229765994d..f3ea8646900 100644 --- a/.github/workflows/check_failed_model_tests.yml +++ b/.github/workflows/check_failed_model_tests.yml @@ -106,7 +106,7 @@ jobs: } >> "$GITHUB_ENV" - name: Send processed report - if: ${{ env.REPORT_TEXT != '' }} + if: ${{ !endsWith(env.REPORT_TEXT, '{}') }} uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 with: # Slack channel id, channel name, or user id to post message. 
diff --git a/utils/check_bad_commit.py b/utils/check_bad_commit.py index 091ed5c4a42..adb25f11264 100644 --- a/utils/check_bad_commit.py +++ b/utils/check_bad_commit.py @@ -182,7 +182,15 @@ def get_commit_info(commit): info = {"test": test, "commit": commit} info.update(get_commit_info(commit)) failed_tests_with_bad_commits.append(info) - reports[model]["single-gpu"] = failed_tests_with_bad_commits + + # If no single-gpu test failures, remove the key + if len(failed_tests_with_bad_commits) > 0: + reports[model]["single-gpu"] = failed_tests_with_bad_commits + else: + reports[model].pop("single-gpu", None) + + # remove the models without any test failure + reports = {k: v for k, v in reports.items() if len(v) > 0} with open(args.output_file, "w", encoding="UTF-8") as fp: json.dump(reports, fp, ensure_ascii=False, indent=4) diff --git a/utils/process_bad_commit_report.py b/utils/process_bad_commit_report.py index f61f1b10664..513dc8df3a3 100644 --- a/utils/process_bad_commit_report.py +++ b/utils/process_bad_commit_report.py @@ -28,7 +28,18 @@ data = json.load(fp) # TODO: extend - team_members = ["ydshieh", "zucchini-nlp", "ArthurZucker", "gante", "LysandreJik", "molbap", "qubvel"] + team_members = [ + "ydshieh", + "zucchini-nlp", + "ArthurZucker", + "gante", + "LysandreJik", + "molbap", + "qubvel", + "Rocketknight1", + "muellerzr", + "SunMarc", + ] # Counting the number of failures grouped by authors new_data = {} From 3d99f1746e0d667cbec9e69b4ec11289c4752630 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Thu, 24 Oct 2024 19:17:52 +0200 Subject: [PATCH 23/99] Fix glm (#34388) * Fix duplicated * fix import --- src/transformers/models/glm/modeling_glm.py | 13 ++----------- src/transformers/models/glm/modular_glm.py | 4 +--- src/transformers/models/phi3/modeling_phi3.py | 5 +---- 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 1e7c6eae0ee..5f8eaf89ed9 100644 
--- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -30,6 +30,7 @@ from ...cache_utils import Cache, DynamicCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, @@ -37,11 +38,11 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack from ...utils import ( add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, - is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging, replace_return_docstrings, @@ -49,16 +50,6 @@ from .configuration_glm import GlmConfig -if is_flash_attn_2_available(): - from ...modeling_flash_attention_utils import _flash_attention_forward - -from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward -from ...processing_utils import Unpack - - -_CHECKPOINT_FOR_DOC = "dummy" - - _CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b" diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 9cfd617eeb2..39ee4a2ad58 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -44,11 +44,9 @@ from .configuration_glm import GlmConfig -_CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b" - logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "dummy" +_CHECKPOINT_FOR_DOC = "THUDM/glm-4-9b" class GlmRMSNorm(Phi3RMSNorm): diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 9e638c27afa..a1a86e3672d 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -28,6 +28,7 @@ from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, 
StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import _flash_attention_forward from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, @@ -39,7 +40,6 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, - is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging, replace_return_docstrings, @@ -47,9 +47,6 @@ from .configuration_phi3 import Phi3Config -if is_flash_attn_2_available(): - from ...modeling_flash_attention_utils import _flash_attention_forward - logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct" From 940a6bd343cfd2ff4f4425b4cbc548d1e1d316da Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Thu, 24 Oct 2024 20:00:13 -0400 Subject: [PATCH 24/99] Use non nested images and batched text Idefics2/3 (#34222) * add support for non nested images and add tests * add tests error scenario * fix style * added single and no image to error tests --- .../idefics2/image_processing_idefics2.py | 1 + .../models/idefics2/processing_idefics2.py | 17 +++- .../idefics3/image_processing_idefics3.py | 3 + .../models/idefics3/processing_idefics3.py | 38 ++++++--- .../pixtral/image_processing_pixtral.py | 1 + .../idefics2/test_processor_idefics2.py | 77 +++++++++++++++--- .../idefics3/test_processor_idefics3.py | 79 ++++++++++++++++--- 7 files changed, 183 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/idefics2/image_processing_idefics2.py b/src/transformers/models/idefics2/image_processing_idefics2.py index ac9df68871e..ce0032f80c5 100644 --- a/src/transformers/models/idefics2/image_processing_idefics2.py +++ b/src/transformers/models/idefics2/image_processing_idefics2.py @@ -99,6 +99,7 @@ def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: isinstance(images, (list, 
tuple)) and len(images) > 0 and isinstance(images[0], (list, tuple)) + and len(images[0]) > 0 and is_valid_image(images[0][0]) ): pass diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index 68566d18267..9a041257c36 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -16,6 +16,7 @@ Processor class for IDEFICS2. """ +from itertools import accumulate from typing import TYPE_CHECKING, List, Optional, Union from ...feature_extraction_utils import BatchFeature @@ -218,7 +219,21 @@ def __call__( if is_image_or_image_url(images): images = [[images]] elif isinstance(images, list) and is_image_or_image_url(images[0]): - images = [images] + if text is not None: + if sum(n_images_in_text) != len(images): + raise ValueError( + f"The total number of {image_token} tokens in the prompts should be the same as the number of images passed." + f" Found {sum(n_images_in_text)} {image_token} tokens and {len(images)} images." + ) + # Reorganize the images to match the prompts + cumsum_images_in_text = [0] + list(accumulate(n_images_in_text)) + images = [ + images[cumsum_images_in_text[i] : cumsum_images_in_text[i + 1]] + for i in range(len(n_images_in_text)) + ] + else: + images = [images] + elif ( not isinstance(images, list) and not isinstance(images[0], list) diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py index 495ac04595f..05a1a396dc7 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3.py +++ b/src/transformers/models/idefics3/image_processing_idefics3.py @@ -151,9 +151,11 @@ def get_resize_output_image_size( def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: """ Convert a single image or a list of images to a list of numpy arrays. + Args: images (`ImageInput`): A single image or a list of images. 
+ Returns: A list of numpy arrays. """ @@ -168,6 +170,7 @@ def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: isinstance(images, (list, tuple)) and len(images) > 0 and isinstance(images[0], (list, tuple)) + and len(images[0]) > 0 and is_valid_image(images[0][0]) ): pass diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index ceafa26a8b1..872f5206f20 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -17,6 +17,7 @@ """ import re +from itertools import accumulate from typing import TYPE_CHECKING, Dict, List, Optional, Union from ...feature_extraction_utils import BatchFeature @@ -241,11 +242,31 @@ def __call__( n_images_in_images = [] inputs = BatchFeature() + if text is not None: + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + n_images_in_text = [sample.count(self.image_token.content) for sample in text] + if images is not None: if is_image_or_image_url(images): images = [[images]] elif isinstance(images, list) and is_image_or_image_url(images[0]): - images = [images] + if text is not None: + if sum(n_images_in_text) != len(images): + raise ValueError( + f"The total number of {self.image_token.content} tokens in the prompts should be the same as the number of images passed." + f" Found {sum(n_images_in_text)} {self.image_token.content} tokens and {len(images)} images." 
+ ) + # Reorganize the images to match the prompts + cumsum_images_in_text = [0] + list(accumulate(n_images_in_text)) + images = [ + images[cumsum_images_in_text[i] : cumsum_images_in_text[i + 1]] + for i in range(len(n_images_in_text)) + ] + else: + images = [images] elif ( not isinstance(images, list) and not isinstance(images[0], list) @@ -263,10 +284,10 @@ def __call__( inputs.update(image_inputs) if text is not None: - if isinstance(text, str): - text = [text] - elif not isinstance(text, list) and not isinstance(text[0], str): - raise ValueError("Invalid input text. Please provide a string, or a list of strings") + if n_images_in_images != n_images_in_text: + raise ValueError( + f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same." + ) image_rows = inputs.pop("rows", [[0] * len(text)]) image_cols = inputs.pop("cols", [[0] * len(text)]) @@ -277,8 +298,6 @@ def __call__( prompt_strings = [] for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols): - n_images_in_text.append(sample.count(image_token)) - # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len` image_prompt_strings = [] for n_rows, n_cols in zip(sample_rows, sample_cols): @@ -305,11 +324,6 @@ def __call__( text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"]) inputs.update(text_inputs) - if n_images_in_images != n_images_in_text: - raise ValueError( - f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same." 
- ) - return inputs def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index a75704fc3db..b4ec0e50c9c 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -120,6 +120,7 @@ def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: isinstance(images, (list, tuple)) and len(images) > 0 and isinstance(images[0], (list, tuple)) + and len(images[0]) > 0 and is_valid_image(images[0][0]) ): pass diff --git a/tests/models/idefics2/test_processor_idefics2.py b/tests/models/idefics2/test_processor_idefics2.py index bf713c6fb8c..d89004679ae 100644 --- a/tests/models/idefics2/test_processor_idefics2.py +++ b/tests/models/idefics2/test_processor_idefics2.py @@ -226,6 +226,73 @@ def test_add_special_tokens_processor(self): self.assertEqual(inputs["input_ids"], expected_input_ids) # fmt: on + def test_non_nested_images_with_batched_text(self): + processor = self.get_processor() + processor.image_processor.do_image_splitting = False + + image_str = "" + text_str_1 = "In this image, we see" + text_str_2 = "bla, bla" + + text = [ + image_str + text_str_1, + text_str_2 + image_str + image_str, + ] + images = [self.image1, self.image2, self.image3] + + inputs = processor(text=text, images=images, padding=True) + + self.assertEqual(inputs["pixel_values"].shape, (2, 2, 3, 767, 980)) + self.assertEqual(inputs["pixel_attention_mask"].shape, (2, 2, 767, 980)) + + def test_process_interleaved_images_prompts_image_error(self): + processor = self.get_processor() + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[self.image1], []] + with self.assertRaises(ValueError): + 
processor(text=text, images=images, padding=True) + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], [self.image2, self.image3]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1, self.image2, self.image3] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], []] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1, self.image2] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + def test_apply_chat_template(self): # Message contains content which a mix of lists with images and image urls and string messages = [ @@ -275,13 +342,3 @@ def prepare_text_inputs(self, batch_size: Optional[int] = None): return ["lower newer ", " upper older longer string"] + [" lower newer"] * ( batch_size - 2 ) - - # Override as PixtralProcessor needs nested images to work properly with batched inputs - @require_vision - def prepare_image_inputs(self, batch_size: Optional[int] = None): - """This function prepares a list of PIL images for testing""" - if batch_size is None: - return super().prepare_image_inputs() - if batch_size < 1: - raise ValueError("batch_size must be greater than 0") - return [[super().prepare_image_inputs()]] 
* batch_size diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py index a53109b02b6..52d2f1539a4 100644 --- a/tests/models/idefics3/test_processor_idefics3.py +++ b/tests/models/idefics3/test_processor_idefics3.py @@ -250,6 +250,74 @@ def test_add_special_tokens_processor(self): self.assertEqual(inputs["input_ids"], expected_input_ids) # fmt: on + def test_non_nested_images_with_batched_text(self): + processor = self.get_processor() + processor.image_processor.do_image_splitting = False + + image_str = "" + text_str_1 = "In this image, we see" + text_str_2 = "In this image, we see" + + text = [ + image_str + text_str_1, + image_str + image_str + text_str_2, + ] + images = [self.image1, self.image2, self.image3] + + inputs = processor(text=text, images=images, padding=True) + + self.assertEqual(np.array(inputs["pixel_values"]).shape, (2, 2, 3, 364, 364)) + self.assertEqual(np.array(inputs["pixel_attention_mask"]).shape, (2, 2, 364, 364)) + + # Copied from tests.models.idefics2.test_processor_idefics2.Idefics2ProcessorTest.test_process_interleaved_images_prompts_image_error + def test_process_interleaved_images_prompts_image_error(self): + processor = self.get_processor() + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[self.image1], []] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], [self.image2, self.image3]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1, 
self.image2, self.image3] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + + text = [ + "This is a test sentence.", + "In this other sentence we try some good things", + ] + images = [[self.image1], []] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [[], [self.image2]] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1, self.image2] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + images = [self.image1] + with self.assertRaises(ValueError): + processor(text=text, images=images, padding=True) + def test_apply_chat_template(self): # Message contains content which a mix of lists with images and image urls and string messages = [ @@ -299,16 +367,7 @@ def prepare_text_inputs(self, batch_size: Optional[int] = None): batch_size - 2 ) - # Override as Idefics3Processor needs nested images to work properly with batched inputs - @require_vision - def prepare_image_inputs(self, batch_size: Optional[int] = None): - """This function prepares a list of PIL images for testing""" - if batch_size is None: - return super().prepare_image_inputs() - if batch_size < 1: - raise ValueError("batch_size must be greater than 0") - return [[super().prepare_image_inputs()]] * batch_size - + # Override tests as inputs_ids padded dimension is the second one but not the last one @require_vision @require_torch def test_kwargs_overrides_default_tokenizer_kwargs(self): From 5779bac4c45b2c881603cafd20663892869d5860 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Fri, 25 Oct 2024 09:44:09 +0200 Subject: [PATCH 25/99] Fix onnx non-expotable inplace aten op (#34376) * fix onnx non-expotable inplace op * mistral, qwen2, qwen2_vl, 
starcoder2 * fixup copies --- src/transformers/models/mimi/modeling_mimi.py | 2 +- src/transformers/models/mistral/modeling_mistral.py | 2 +- src/transformers/models/mixtral/modeling_mixtral.py | 2 +- src/transformers/models/moshi/modeling_moshi.py | 4 ++-- src/transformers/models/phi3/modeling_phi3.py | 2 +- src/transformers/models/phimoe/modeling_phimoe.py | 2 +- src/transformers/models/qwen2/modeling_qwen2.py | 2 +- src/transformers/models/qwen2_moe/modeling_qwen2_moe.py | 2 +- src/transformers/models/qwen2_vl/modeling_qwen2_vl.py | 2 +- src/transformers/models/starcoder2/modeling_starcoder2.py | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py index 514f9de706e..cbdd2c663c5 100644 --- a/src/transformers/models/mimi/modeling_mimi.py +++ b/src/transformers/models/mimi/modeling_mimi.py @@ -1156,7 +1156,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index f198e4abc85..321d3dc0daf 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -961,7 +961,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = 
causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 192b7801af0..78a17178ecd 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1174,7 +1174,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index 97200b7d042..9975996d21d 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -1385,7 +1385,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: @@ -1689,7 +1689,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git 
a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index a1a86e3672d..bae3f6d4cda 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -1136,7 +1136,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index 791f6df50bb..f3690e5f686 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -1305,7 +1305,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 0d97f2ffb72..0883fac1aeb 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -1059,7 +1059,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= 
diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 36de586265c..7f4f19aba1f 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1239,7 +1239,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 4e9401c77e4..90bf29c8b5d 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1321,7 +1321,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index c8f22dee43f..1a8b6412e73 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -1033,7 +1033,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( sliding_attend_mask = 
torch.arange(target_length, device=device) <= ( cache_position.reshape(-1, 1) - config.sliding_window ) - diagonal_attend_mask |= sliding_attend_mask + diagonal_attend_mask.bitwise_or_(sliding_attend_mask) causal_mask *= diagonal_attend_mask causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) if attention_mask is not None: From 9f365fe0ac7fda3aa8adac6707f9368ac981cdd3 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Fri, 25 Oct 2024 11:02:07 +0200 Subject: [PATCH 26/99] Fix right padding in LLaVA models (#34305) * fix right pad llavas * device mismatch --- src/transformers/models/llava/modeling_llava.py | 7 ++++++- .../models/video_llava/modeling_video_llava.py | 7 ++++++- src/transformers/models/vipllava/modeling_vipllava.py | 7 ++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 50b3d4c6a89..0b2492fc711 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -354,7 +354,12 @@ def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, in (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device ) image_to_overwrite[batch_indices, text_to_overwrite] = False - image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) + if left_padding: + image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) + else: + mask = torch.ones_like(image_to_overwrite, dtype=torch.bool).cumsum(-1) - 1 + padding_mask = mask <= new_token_positions[:, -1:].to(target_device) + image_to_overwrite &= padding_mask if image_to_overwrite.sum() != image_features.shape[:-1].numel(): raise ValueError( diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 0fe89676b92..a9bd8b745a6 
100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -339,7 +339,12 @@ def _merge_input_ids_with_visual_features( # 5. Fill the embeddings corresponding to the images. Anything that is still zeros needs filling image_to_overwrite = torch.full((batch_size, max_seq_len), True, dtype=torch.bool, device=inputs_embeds.device) image_to_overwrite[batch_indices, text_to_overwrite] = False - image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) + if left_padding: + image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) + else: + mask = torch.ones_like(image_to_overwrite, dtype=torch.bool).cumsum(-1) - 1 + padding_mask = mask <= new_token_positions[:, -1:].to(target_device) + image_to_overwrite &= padding_mask if image_to_overwrite.sum() != visual_features.shape[:-1].numel(): visual_type = "videos" if num_frames == 8 else "images" diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index dd7baa34406..987ae0ad0c6 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -350,7 +350,12 @@ def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, in (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device ) image_to_overwrite[batch_indices, text_to_overwrite] = False - image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) + if left_padding: + image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) + else: + mask = torch.ones_like(image_to_overwrite, dtype=torch.bool).cumsum(-1) - 1 + padding_mask = mask <= new_token_positions[:, -1:].to(target_device) + image_to_overwrite &= padding_mask if image_to_overwrite.sum() != 
image_features.shape[:-1].numel(): raise ValueError( From 223855314f879f99ace727cb11d748a2f5f1d48d Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 25 Oct 2024 12:32:39 +0200 Subject: [PATCH 27/99] no filter (#34391) * no filter * no filter * no filter --------- Co-authored-by: ydshieh --- utils/tests_fetcher.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 9e15f2e115e..906e85e1de6 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -997,7 +997,7 @@ def _print_list(l) -> str: def infer_tests_to_run( output_file: str, diff_with_last_commit: bool = False, - filter_models: bool = True, + filter_models: bool = False, ): """ The main function called by the test fetcher. Determines the tests to run from the diff. @@ -1229,6 +1229,6 @@ def create_test_list_from_filter(full_test_list, out_path): infer_tests_to_run( args.output_file, diff_with_last_commit=diff_with_last_commit, - filter_models=(not (commit_flags["no_filter"] or is_main_branch)), + filter_models=False, ) filter_tests(args.output_file, ["repo_utils"]) From 8814043c8c62034277b04e73a44e25231ab020ad Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Fri, 25 Oct 2024 11:46:46 +0100 Subject: [PATCH 28/99] SynthID: better example (#34372) * better example * Update src/transformers/generation/configuration_utils.py * Update src/transformers/generation/logits_process.py * nits --- docs/source/en/internal/generation_utils.md | 4 +--- src/transformers/generation/configuration_utils.py | 10 +++++----- src/transformers/generation/logits_process.py | 10 +++++----- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index 946940cb019..eb25ddb6329 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -428,13 +428,11 @@ A [`Constraint`] 
can be used to force the generation to include specific tokens - __call__ [[autodoc]] BayesianDetectorConfig - - __call__ [[autodoc]] BayesianDetectorModel - - __call__ + - forward [[autodoc]] SynthIDTextWatermarkingConfig - - __call__ [[autodoc]] SynthIDTextWatermarkDetector - __call__ diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index c460a19885a..3c204481b04 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -1471,8 +1471,8 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig): ```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig - >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it') - >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it') + >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b', padding_side="left") + >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b') >>> # SynthID Text configuration >>> watermarking_config = SynthIDTextWatermarkingConfig( @@ -1481,11 +1481,11 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig): ... ) >>> # Generation with watermarking - >>> tokenized_prompts = tokenizer(["your prompts here"]) + >>> tokenized_prompts = tokenizer(["Once upon a time, "], return_tensors="pt", padding=True) >>> output_sequences = model.generate( - ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, + ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, max_new_tokens=10 ... 
) - >>> watermarked_text = tokenizer.batch_decode(output_sequences) + >>> watermarked_text = tokenizer.batch_decode(output_sequences, skip_special_tokens=True) ``` """ diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index fde95c7a856..9d244191da8 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -2565,8 +2565,8 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor): ```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig - >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b-it') - >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b-it') + >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b', padding_side="left") + >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b') >>> # SynthID Text configuration >>> watermarking_config = SynthIDTextWatermarkingConfig( @@ -2575,11 +2575,11 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor): ... ) >>> # Generation with watermarking - >>> tokenized_prompts = tokenizer(["your prompts here"]) + >>> tokenized_prompts = tokenizer(["Once upon a time, "], return_tensors="pt", padding=True) >>> output_sequences = model.generate( - ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, + ... **tokenized_prompts, watermarking_config=watermarking_config, do_sample=True, max_new_tokens=10 ... 
) - >>> watermarked_text = tokenizer.batch_decode(output_sequences) + >>> watermarked_text = tokenizer.batch_decode(output_sequences, skip_special_tokens=True) ``` """ From 186b8dc190481032892d0a5d68b3db64f4ad4543 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Fri, 25 Oct 2024 11:55:07 +0100 Subject: [PATCH 29/99] Tests: upgrade `test_eager_matches_sdpa_generate` (#34386) --- tests/generation/test_utils.py | 82 +++++++++++ tests/models/bert/test_modeling_bert.py | 74 ---------- tests/models/cohere/test_modeling_cohere.py | 58 -------- tests/models/falcon/test_modeling_falcon.py | 74 ---------- tests/models/glm/test_modeling_glm.py | 71 --------- .../models/gpt_neox/test_modeling_gpt_neox.py | 64 +-------- tests/models/jetmoe/test_modeling_jetmoe.py | 9 -- tests/models/llama/test_modeling_llama.py | 62 -------- tests/models/mistral/test_modeling_mistral.py | 8 -- tests/models/mixtral/test_modeling_mixtral.py | 9 -- tests/models/mllama/test_modeling_mllama.py | 12 -- tests/models/moshi/test_modeling_moshi.py | 6 +- .../models/musicgen/test_modeling_musicgen.py | 136 ------------------ .../test_modeling_musicgen_melody.py | 68 --------- tests/models/olmo/test_modeling_olmo.py | 9 -- tests/models/olmoe/test_modeling_olmoe.py | 9 -- tests/models/opt/test_modeling_opt.py | 63 -------- tests/models/qwen2/test_modeling_qwen2.py | 8 -- .../qwen2_moe/test_modeling_qwen2_moe.py | 6 - .../models/stablelm/test_modeling_stablelm.py | 66 --------- .../test_modeling_xlm_roberta_xl.py | 81 +---------- tests/test_modeling_common.py | 56 -------- 22 files changed, 85 insertions(+), 946 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 4e5d8f30265..6f2eaf734df 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -15,6 +15,7 @@ import copy +import gc import inspect import tempfile import unittest @@ -33,6 +34,7 @@ require_torch_gpu, require_torch_multi_accelerator, require_torch_multi_gpu, + 
require_torch_sdpa, slow, torch_device, ) @@ -2046,6 +2048,86 @@ def test_inherits_generation_mixin(self): for model_class in self.all_generative_model_classes: self.assertTrue("GenerationMixin" in str(model_class.__bases__)) + @require_torch_sdpa + @slow + def test_eager_matches_sdpa_generate(self): + max_new_tokens = 30 + + for model_class in self.all_generative_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, original_inputs_dict = self.prepare_config_and_inputs_for_generate() + inputs_dict = {} + for input_name, input_data in original_inputs_dict.items(): + if isinstance(input_data, torch.Tensor) and input_data.dtype in [torch.float32, torch.bfloat16]: + inputs_dict[input_name] = input_data.to(torch.float16) + else: + inputs_dict[input_name] = input_data + main_input = inputs_dict[model_class.main_input_name] + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + main_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + del model + gc.collect() + + generate_kwargs = { + "max_new_tokens": max_new_tokens, + "do_sample": False, + "return_dict_in_generate": True, + "output_scores": True, + } + + model_sdpa = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + ).to(torch_device) + res_sdpa = model_sdpa.generate(**inputs_dict, **generate_kwargs) + del model_sdpa + gc.collect() + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + res_eager = model_eager.generate(**inputs_dict, **generate_kwargs) + del model_eager + gc.collect() + + # Eager and SDPA are very similar, but not exactly the same. 
Because we are using random models, this + # test would be flaky if we only checked the sequences. Two situations in which this test passes: + # 1. The sequences are the same + # 2. The sequences are different, but the scores up until the first mismatch are nearly identical + output_matches = res_eager.sequences == res_sdpa.sequences + has_matching_outputs = output_matches.all() + has_matching_scores = None + if not has_matching_outputs: + input_length = main_input.shape[1] + for batch_idx in range(res_eager.sequences.shape[0]): + batch_matches = output_matches[batch_idx] + if batch_matches.all(): + continue + first_mismatch_idx = batch_matches.int().argmin() # gets the index of the first False + first_mismatch_idx -= input_length # scores doesn't include data regarding input tokens + sdpa_first_mismatch_scores = res_sdpa.scores[first_mismatch_idx][batch_idx] + eager_first_mismatch_scores = res_eager.scores[first_mismatch_idx][batch_idx] + has_matching_scores = torch.allclose( + sdpa_first_mismatch_scores, eager_first_mismatch_scores, rtol=1e-3, atol=1e-3 + ) + if not has_matching_scores: + break + + self.assertTrue(has_matching_outputs or has_matching_scores) + def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): # we can be sure what is batch size from main input but seq length depends on model type and whether input is text/audio/image # so we infer actual text seq length from model_tester, same was as it is done in `test_modeling_common.py` tests` diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 5c87fbea8ee..8ac1c3d2b40 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -22,7 +22,6 @@ CaptureLogger, require_torch, require_torch_accelerator, - require_torch_sdpa, slow, torch_device, ) @@ -672,79 +671,6 @@ def test_torchscript_device_change(self): loaded = torch.jit.load(os.path.join(tmp, "bert.pt"), 
map_location=torch_device) loaded(inputs_dict["input_ids"].to(torch_device), inputs_dict["attention_mask"].to(torch_device)) - # This test was copied from the common test_eager_matches_sdpa_generate(), but without low_cpu_mem_usage=True. - # TODO: Remove this and use the parent method (in common tests) once BERT supports low_cpu_mem_usage=True. - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - max_new_tokens = 30 - - if len(self.all_generative_model_classes) == 0: - self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - # low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - # low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in 
class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - @require_torch class BertModelIntegrationTest(unittest.TestCase): diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py index 3a05867dfdf..cd3b2f978e7 100644 --- a/tests/models/cohere/test_modeling_cohere.py +++ b/tests/models/cohere/test_modeling_cohere.py @@ -307,64 +307,6 @@ def test_model_various_embeddings(self): def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() - @require_bitsandbytes - @require_torch_sdpa - @require_torch_multi_gpu - @slow - def test_eager_matches_sdpa_generate(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - max_new_tokens = 30 - - model_id = "CohereForAI/c4ai-command-r-v01-4bit" - tokenizer = AutoTokenizer.from_pretrained(model_id) - - model_sdpa = CohereForCausalLM.from_pretrained( - model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto" - ) - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = CohereForCausalLM.from_pretrained( - model_id, torch_dtype=torch.float16, attn_implementation="eager", device_map="auto" - ) - - 
self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - texts = [ - "hi here's a longer context, getting longer and", - "Hello this is a very long sentence my friend, very long for real", - "Today I am in Paris and", - ] - - for padding_side in ["left", "right"]: - tokenizer.padding_side = padding_side - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) - - res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - - with self.subTest(f"{padding_side}"): - torch.testing.assert_close( - res_eager, - res_sdpa, - msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", - ) - @require_torch @slow diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py index a1a2b0155cb..ce04fae94ea 100644 --- a/tests/models/falcon/test_modeling_falcon.py +++ b/tests/models/falcon/test_modeling_falcon.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Testing suite for the PyTorch Falcon model.""" -import tempfile import unittest from parameterized import parameterized @@ -27,7 +26,6 @@ set_seed, ) from transformers.testing_utils import ( - is_flaky, require_bitsandbytes, require_torch, require_torch_sdpa, @@ -520,78 +518,6 @@ def test_model_rope_scaling(self): torch.testing.assert_close(ntk_sin_long, original_sin_long) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - max_new_tokens = 30 - - if len(self.all_generative_model_classes) == 0: - self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - 
self.assertTrue(model_eager.config._attn_implementation == "eager") - - # NOTE: This check is disabled for Falcon as the non-SDPA/SDPA implementation is in the same class (legacy reason). - # for name, submodule in model_eager.named_modules(): - # if "SdpaAttention" in submodule.__class__.__name__: - # raise ValueError("The eager model should not have SDPA attention layers") - - # has_sdpa = False - # for name, submodule in model_sdpa.named_modules(): - # if "SdpaAttention" in submodule.__class__.__name__: - # has_sdpa = True - # break - # if not has_sdpa: - # raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - @require_torch class FalconLanguageGenerationTest(unittest.TestCase): diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py index f703ccd5096..32bce7cbfa6 100644 --- a/tests/models/glm/test_modeling_glm.py +++ b/tests/models/glm/test_modeling_glm.py @@ -758,77 +758,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_generate(self): - """Overwrite to add flakyness: outputs sometimes start to diverge after some tokens""" - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, 
torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - @slow @require_torch_accelerator diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py 
b/tests/models/gpt_neox/test_modeling_gpt_neox.py index 196f873696e..2c3319f0247 100644 --- a/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -19,7 +19,7 @@ from parameterized import parameterized from transformers import AutoTokenizer, GPTNeoXConfig, is_torch_available, set_seed -from transformers.testing_utils import require_torch, require_torch_sdpa, slow, torch_device +from transformers.testing_utils import require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -434,68 +434,6 @@ def test_model_rope_scaling(self): torch.testing.assert_close(ntk_sin_long, original_sin_long) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - """ - Based on tests.models.llama.test_modeling_llama.LlamaModelTest.test_eager_matches_sdpa_generate - which also overwrites the common test as the test is flaky on tiny models. 
- """ - max_new_tokens = 30 - - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-1b") - - model_sdpa = GPTNeoXForCausalLM.from_pretrained( - "EleutherAI/pythia-1b", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = GPTNeoXForCausalLM.from_pretrained( - "EleutherAI/pythia-1b", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - texts = [ - "hi here's a longer context, getting longer and", - "Hello this is a very long sentence my friend, very long for real", - "Today I am in Paris and", - ] - - for padding_side in ["left", "right"]: - tokenizer.padding_side = padding_side - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) - - res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - - with self.subTest(f"{padding_side}"): - torch.testing.assert_close( - res_eager, - res_sdpa, - msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", - ) - @require_torch class GPTNeoXLanguageGenerationTest(unittest.TestCase): diff --git a/tests/models/jetmoe/test_modeling_jetmoe.py b/tests/models/jetmoe/test_modeling_jetmoe.py index 867f97c48a6..a04d8bba741 100644 --- 
a/tests/models/jetmoe/test_modeling_jetmoe.py +++ b/tests/models/jetmoe/test_modeling_jetmoe.py @@ -24,11 +24,9 @@ from transformers import AutoTokenizer, JetMoeConfig, is_torch_available from transformers.testing_utils import ( backend_empty_cache, - is_flaky, require_flash_attn, require_torch, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -302,13 +300,6 @@ class JetMoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix test_disk_offload_bin = False test_disk_offload_safetensors = False - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - @parameterized.expand([(1, False), (1, True), (4, False)]) def test_new_cache_format(self, num_beams, do_sample): pass diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index bf7ca784895..824337d8bdd 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -32,7 +32,6 @@ require_torch, require_torch_accelerator, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -651,67 +650,6 @@ def test_use_flash_attention_2_true(self): if not has_flash: raise ValueError("The flash model should have flash attention layers") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - max_new_tokens = 30 - - tokenizer = LlamaTokenizer.from_pretrained("saibo/llama-1B") - - model_sdpa = LlamaForCausalLM.from_pretrained( - "saibo/llama-1B", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = LlamaForCausalLM.from_pretrained( - "saibo/llama-1B", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - 
).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - texts = [ - "hi here's a longer context, getting longer and", - "Hello this is a very long sentence my friend, very long for real", - "Today I am in Paris and", - ] - - for padding_side in ["left", "right"]: - tokenizer.padding_side = padding_side - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) - - res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - - with self.subTest(f"{padding_side}"): - torch.testing.assert_close( - res_eager, - res_sdpa, - msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", - ) - @unittest.skip("Broken by the loss update will fix soon @ArthurZucker") def test_torch_fx_output_loss(self, *args, **kwargs): pass diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index 600c4ffa14b..f2ee714bcdb 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -24,7 +24,6 @@ from transformers import AutoTokenizer, MistralConfig, is_torch_available, set_seed from transformers.testing_utils import ( backend_empty_cache, - is_flaky, require_bitsandbytes, require_flash_attn, require_read_token, @@ -332,13 +331,6 @@ def is_pipeline_test_to_skip( ): return True - # TODO: @Fxmarty - 
@is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - def setUp(self): self.model_tester = MistralModelTester(self) self.config_tester = ConfigTester(self, config_class=MistralConfig, hidden_size=37) diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 0688435e814..b9b5faed851 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -21,11 +21,9 @@ from transformers import MixtralConfig, is_torch_available from transformers.testing_utils import ( - is_flaky, require_flash_attn, require_torch, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -332,13 +330,6 @@ def is_pipeline_test_to_skip( ): return True - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - def setUp(self): self.model_tester = MixtralModelTester(self) self.config_tester = ConfigTester(self, config_class=MixtralConfig, hidden_size=37) diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index fafa2f71331..3efa7b778fb 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -132,12 +132,6 @@ def setUp(self): self.model_tester = MllamaText2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=MllamaTextConfig, has_text_modality=True) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - class MllamaVisionText2TextModelTester: def __init__( @@ -360,12 +354,6 @@ def _check_attentions_for_generate( self.assertListEqual([layer_attention.shape for layer_attention in iter_attentions], 
expected_shapes) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - @require_torch_sdpa @slow @is_flaky() diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index b299b414d60..dd9302ee2c5 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -788,14 +788,10 @@ def test_left_padding_compatibility(self): @slow @is_flaky(max_attempts=5, description="flaky on some models.") def test_eager_matches_sdpa_generate(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") + """Overwritten -- mochi has custom inputs and custom output checks""" max_new_tokens = 5 - if len(self.all_generative_model_classes) == 0: - self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") - for model_class in self.all_generative_model_classes: if not model_class._supports_sdpa: self.skipTest(f"{model_class.__name__} does not support SDPA") diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 438178bfc6f..346ad60debe 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -819,74 +819,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) - @require_torch_sdpa - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate - def test_eager_matches_sdpa_generate(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = 
inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - def prepare_musicgen_inputs_dict( config, @@ -2085,74 +2017,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) - 
@require_torch_sdpa - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate - def test_eager_matches_sdpa_generate(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, 
attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - def test_requires_grad_with_frozen_encoders(self): config = self.model_tester.get_config() for model_class in self.all_model_classes: diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index f53fc21ba80..f3b6be0ac65 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -1866,74 +1866,6 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) - @require_torch_sdpa - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate - def test_eager_matches_sdpa_generate(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - 
low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - def test_requires_grad_with_frozen_encoders(self): config = self.model_tester.get_config() for model_class in self.all_model_classes: diff --git a/tests/models/olmo/test_modeling_olmo.py b/tests/models/olmo/test_modeling_olmo.py index fbe73248d00..a85e9db3458 100644 --- a/tests/models/olmo/test_modeling_olmo.py +++ b/tests/models/olmo/test_modeling_olmo.py @@ -24,10 +24,8 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast from transformers.testing_utils import ( - is_flaky, require_tokenizers, require_torch, - require_torch_sdpa, slow, torch_device, ) @@ -317,13 +315,6 @@ def test_model_various_embeddings(self): def test_save_load_fast_init_from_base(self): pass - # TODO: @Fxmarty - 
@is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - @parameterized.expand([("linear",), ("dynamic",)]) def test_model_rope_scaling(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/olmoe/test_modeling_olmoe.py b/tests/models/olmoe/test_modeling_olmoe.py index 08ec1458efe..9efadb06eb4 100644 --- a/tests/models/olmoe/test_modeling_olmoe.py +++ b/tests/models/olmoe/test_modeling_olmoe.py @@ -22,10 +22,8 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast from transformers.testing_utils import ( - is_flaky, require_tokenizers, require_torch, - require_torch_sdpa, slow, torch_device, ) @@ -330,13 +328,6 @@ def test_model_various_embeddings(self): def test_save_load_fast_init_from_base(self): pass - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - @parameterized.expand([("linear",), ("dynamic",)]) def test_model_rope_scaling(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/opt/test_modeling_opt.py b/tests/models/opt/test_modeling_opt.py index 2093dfe685b..8bae2af8045 100644 --- a/tests/models/opt/test_modeling_opt.py +++ b/tests/models/opt/test_modeling_opt.py @@ -25,7 +25,6 @@ require_torch, require_torch_accelerator, require_torch_fp16, - require_torch_sdpa, slow, torch_device, ) @@ -339,68 +338,6 @@ def test_opt_sequence_classification_model_for_multi_label(self): result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - 
@require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - max_new_tokens = 30 - - tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350M") - - texts = [ - "hi here's a longer context, getting longer and", - "Hello this is a very long sentence my friend, very long for real", - "Today I am in Paris and", - ] - - model_sdpa = OPTForCausalLM.from_pretrained( - "facebook/opt-350M", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="sdpa", - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = OPTForCausalLM.from_pretrained( - "facebook/opt-350M", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for _, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for _, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - for padding_side in ["left", "right"]: - tokenizer.padding_side = padding_side - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) - - res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - - with self.subTest(f"{padding_side}"): - torch.testing.assert_close( - res_eager, - res_sdpa, - msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", - ) - @unittest.skip(reason="Does not work on the tiny model as we keep hitting 
edge cases.") def test_model_parallelism(self): super().test_model_parallelism() diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 301937079ae..4e57f8e0f00 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -343,14 +343,6 @@ def is_pipeline_test_to_skip( ): return True - # Ignore copy - # TODO: @Fxmarty - @require_torch_sdpa - @slow - @unittest.skip(reason="Currently failing.") - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - def setUp(self): self.model_tester = Qwen2ModelTester(self) self.config_tester = ConfigTester(self, config_class=Qwen2Config, hidden_size=37) diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index 30d7996d7e7..c545e882fae 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -368,12 +368,6 @@ def is_pipeline_test_to_skip( ): return True - # Ignore copy - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - super().test_eager_matches_sdpa_generate() - def setUp(self): self.model_tester = Qwen2MoeModelTester(self) self.config_tester = ConfigTester(self, config_class=Qwen2MoeConfig, hidden_size=37) diff --git a/tests/models/stablelm/test_modeling_stablelm.py b/tests/models/stablelm/test_modeling_stablelm.py index e1f9bc2b8e8..91044a4eb75 100644 --- a/tests/models/stablelm/test_modeling_stablelm.py +++ b/tests/models/stablelm/test_modeling_stablelm.py @@ -21,11 +21,9 @@ from transformers import StableLmConfig, is_torch_available, set_seed from transformers.testing_utils import ( - is_flaky, require_bitsandbytes, require_flash_attn, require_torch, - require_torch_sdpa, slow, torch_device, ) @@ -558,67 +556,3 @@ def test_model_3b_long_prompt(self): input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device) generated_ids = 
model.generate(input_ids, max_new_tokens=4, temperature=0) self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-3:].tolist()) - - # Copied from transformers.tests.models.llama.test_modeling_llama.LlamaModelTest.test_eager_matches_sdpa_generate with Llama->StableLm,saibo/llama-1B->stabilityai/stablelm-3b-4e1t - # TODO: @Fxmarty - @is_flaky(max_attempts=3, description="flaky on some models.") - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - max_new_tokens = 30 - - tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t") - - model_sdpa = StableLmForCausalLM.from_pretrained( - "stabilityai/stablelm-3b-4e1t", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = StableLmForCausalLM.from_pretrained( - "stabilityai/stablelm-3b-4e1t", - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - if "SdpaAttention" in submodule.__class__.__name__: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - texts = [ - "hi here's a longer context, getting longer and", - "Hello this is a very long sentence my friend, very long for real", - "Today I am in Paris and", - ] - - for padding_side in ["left", "right"]: - tokenizer.padding_side = padding_side - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device) - - res_eager = 
model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - - with self.subTest(f"{padding_side}"): - torch.testing.assert_close( - res_eager, - res_sdpa, - msg=f"\n{tokenizer.batch_decode(res_eager)} \nvs\n{tokenizer.batch_decode(res_sdpa)}", - ) diff --git a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py index 5b426d27799..5d9abb238e7 100644 --- a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py +++ b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py @@ -14,11 +14,10 @@ # limitations under the License. -import tempfile import unittest from transformers import XLMRobertaXLConfig, is_torch_available -from transformers.testing_utils import require_torch, require_torch_sdpa, slow, torch_device +from transformers.testing_utils import require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -523,84 +522,6 @@ def test_create_position_ids_from_inputs_embeds(self): self.assertEqual(position_ids.shape, expected_positions.shape) self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) - # TODO: Remove this and use the parent method (in common tests) once XLM RoBERTa XL supports low_cpu_mem_usage=True. 
- @require_torch_sdpa - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate - def test_eager_matches_sdpa_generate(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - max_new_tokens = 30 - - if len(self.all_generative_model_classes) == 0: - self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - # Ignore copy - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=False, - ).to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - # Ignore copy - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=False, - attn_implementation="eager", - ).to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") 
- - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa: - raise ValueError("The SDPA model should have SDPA attention layers") - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - @require_torch class XLMRobertaModelXLIntegrationTest(unittest.TestCase): diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 964b7b912b4..51d51dfcc28 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -4469,62 +4469,6 @@ def test_sdpa_can_compile_dynamic(self): with torch.no_grad(): _ = model(**inputs_dict) - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - max_new_tokens = 30 - - if len(self.all_generative_model_classes) == 0: - self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = 
max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model_sdpa = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ).to(torch_device) - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - attn_implementation="eager", - ).to(torch_device) - - # Just test that a large cache works as expected - res_eager = model_eager.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - res_sdpa = model_sdpa.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False - ) - - self.assertTrue(torch.allclose(res_eager, res_sdpa)) - @require_torch_sdpa def test_sdpa_matches_eager_sliding_window(self): if not self.has_attentions: From e447185b1f19df3032b11b586506225bfdf6d111 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Fri, 25 Oct 2024 10:23:20 -0400 Subject: [PATCH 30/99] Fix bnb training test failure (#34414) * Fix bnb training test: compatibility with OPTSdpaAttention --- tests/quantization/bnb/test_4bit.py | 3 ++- tests/quantization/bnb/test_mixed_int8.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 0ac9b3d82fc..3eae429abb2 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -29,6 +29,7 @@ BitsAndBytesConfig, pipeline, ) +from transformers.models.opt.modeling_opt import OPTAttention from transformers.testing_utils import ( apply_skip_if_not_implemented, is_bitsandbytes_available, @@ -565,7 +566,7 @@ def test_training(self): # Step 2: add adapters for _, 
module in model.named_modules(): - if "OPTAttention" in repr(type(module)): + if isinstance(module, OPTAttention): module.q_proj = LoRALayer(module.q_proj, rank=16) module.k_proj = LoRALayer(module.k_proj, rank=16) module.v_proj = LoRALayer(module.v_proj, rank=16) diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 5a99ab32e42..567aa956271 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -29,6 +29,7 @@ BitsAndBytesConfig, pipeline, ) +from transformers.models.opt.modeling_opt import OPTAttention from transformers.testing_utils import ( apply_skip_if_not_implemented, is_accelerate_available, @@ -868,7 +869,7 @@ def test_training(self): # Step 2: add adapters for _, module in model.named_modules(): - if "OPTAttention" in repr(type(module)): + if isinstance(module, OPTAttention): module.q_proj = LoRALayer(module.q_proj, rank=16) module.k_proj = LoRALayer(module.k_proj, rank=16) module.v_proj = LoRALayer(module.v_proj, rank=16) From f73f5e62e2383c1cb6975fca70082d6dc51ec6f2 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 25 Oct 2024 17:14:07 +0200 Subject: [PATCH 31/99] Avoid check expected exception when it is on CUDA (#34408) * update * update --------- Co-authored-by: ydshieh --- .../pipelines/test_pipelines_summarization.py | 5 +++-- .../test_pipelines_text_generation.py | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/pipelines/test_pipelines_summarization.py b/tests/pipelines/test_pipelines_summarization.py index 465dba9743c..613b9dca8e1 100644 --- a/tests/pipelines/test_pipelines_summarization.py +++ b/tests/pipelines/test_pipelines_summarization.py @@ -85,8 +85,9 @@ def run_pipeline_test(self, summarizer, _): and len(summarizer.model.trainable_weights) > 0 and "GPU" in summarizer.model.trainable_weights[0].device ): - with self.assertRaises(Exception): - outputs = 
summarizer("This " * 1000) + if str(summarizer.device) == "cpu": + with self.assertRaises(Exception): + outputs = summarizer("This " * 1000) outputs = summarizer("This " * 1000, truncation=TruncationStrategy.ONLY_FIRST) @require_torch diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index 277c870b4d1..51f3cae5e31 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -493,17 +493,19 @@ def run_pipeline_test(self, text_generator, _): and text_generator.model.__class__.__name__ not in EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS ): # Handling of large generations - with self.assertRaises((RuntimeError, IndexError, ValueError, AssertionError)): - text_generator("This is a test" * 500, max_new_tokens=20) + if str(text_generator.device) == "cpu": + with self.assertRaises((RuntimeError, IndexError, ValueError, AssertionError)): + text_generator("This is a test" * 500, max_new_tokens=20) outputs = text_generator("This is a test" * 500, handle_long_generation="hole", max_new_tokens=20) # Hole strategy cannot work - with self.assertRaises(ValueError): - text_generator( - "This is a test" * 500, - handle_long_generation="hole", - max_new_tokens=tokenizer.model_max_length + 10, - ) + if str(text_generator.device) == "cpu": + with self.assertRaises(ValueError): + text_generator( + "This is a test" * 500, + handle_long_generation="hole", + max_new_tokens=tokenizer.model_max_length + 10, + ) @require_torch @require_accelerate From 6a62a6d1b54123ede3a1e3bda57c924c64e78124 Mon Sep 17 00:00:00 2001 From: Rudy Delouya Date: Fri, 25 Oct 2024 17:52:29 +0200 Subject: [PATCH 32/99] Fix typos in agents_advanced.md (#34405) --- docs/source/en/agents_advanced.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/agents_advanced.md b/docs/source/en/agents_advanced.md index 2327357525d..ddcc619b4f9 100644 --- 
a/docs/source/en/agents_advanced.md +++ b/docs/source/en/agents_advanced.md @@ -66,10 +66,10 @@ manager_agent.run("Who is the CEO of Hugging Face?") Let's take again the tool example from main documentation, for which we had implemented a `tool` decorator. -If you need to add variation, like custom attributes for your too, you can build your tool following the fine-grained method: building a class that inherits from the [`Tool`] superclass. +If you need to add variation, like custom attributes for your tool, you can build your tool following the fine-grained method: building a class that inherits from the [`Tool`] superclass. The custom tool needs: -- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name is `model_download_counter`. +- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name it `model_download_counter`. - An attribute `description` is used to populate the agent's system prompt. - An `inputs` attribute, which is a dictionary with keys `"type"` and `"description"`. It contains information that helps the Python interpreter make educated choices about the input. - An `output_type` attribute, which specifies the output type. 
@@ -240,4 +240,4 @@ with gr.Blocks() as demo: if __name__ == "__main__": demo.launch() -``` \ No newline at end of file +``` From 1d063793318b20654ebb850f48f43e0a247ab7bb Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 25 Oct 2024 08:52:45 -0700 Subject: [PATCH 33/99] [docs] Cache implementations (#34325) cache --- src/transformers/generation/configuration_utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 3c204481b04..9b543f6c357 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -172,7 +172,15 @@ class GenerationConfig(PushToHubMixin): speed up decoding. cache_implementation (`str`, *optional*, default to `None`): Name of the cache class that will be instantiated in `generate`, for faster decoding. Possible values are: - {ALL_CACHE_IMPLEMENTATIONS}. We support other cache types, but they must be manually instantiated and + + - `"static"`: [`StaticCache`] + - `"offloaded_static"`: [`OffloadedStaticCache`] + - `"sliding_window"`: [`SlidingWindowCache`] + - `"hybrid"`: [`HybridCache`] + - `"mamba"`: [`MambaCache`] + - `"quantized"`: [`QuantizedCache`] + + We support other cache types, but they must be manually instantiated and passed to `generate` through the `past_key_values` argument. See our [cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information. 
cache_config (`CacheConfig` or `dict`, *optional*, default to `None`): From fddbd3c13cca7a51515a039c6f2497e94905acb4 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 28 Oct 2024 11:24:56 +0100 Subject: [PATCH 34/99] Fix pix2struct (#34374) * fix * fix and test use_cache test * style * remove atol --- .../models/pix2struct/modeling_pix2struct.py | 60 +++++++++++-------- .../pix2struct/test_modeling_pix2struct.py | 11 ++++ 2 files changed, 46 insertions(+), 25 deletions(-) diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index b1ac81bb1f2..176dadd5b88 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -762,11 +762,14 @@ def _relative_position_bucket(relative_position, bidirectional=True, num_buckets return relative_buckets # Adapted from transformers.models.t5.modeling_t5.T5Attention.compute_bias - def compute_bias(self, query_length, key_length, device=None): + def compute_bias(self, query_length, key_length, device=None, cache_position=None): """Compute binned relative position bias""" if device is None: device = self.relative_attention_bias.weight.device - context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + if cache_position is None: + context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None] + else: + context_position = cache_position[:, None].to(device) memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :] relative_position = memory_position - context_position # shape (query_length, key_length) relative_position_bucket = self._relative_position_bucket( @@ -779,6 +782,7 @@ def compute_bias(self, query_length, key_length, device=None): values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) return 
values + # Adapted from transformers.models.t5.modeling_t5.T5Attention.forward def forward( self, hidden_states, @@ -796,61 +800,66 @@ def forward( Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). """ # Input is (batch_size, seq_length, dim) - # Mask is (batch_size, 1, 1, key_length) (non-causal) or (batch_size, 1, query_length, key_length) + # Mask is (batch_size, 1, 1, key_length) (non-causal) or (batch_size, 1, seq_length, key_length) (causal decoder) batch_size, seq_length = hidden_states.shape[:2] # if key_value_states are provided this layer is used as a cross-attention layer for the decoder is_cross_attention = key_value_states is not None - query_states = self.query(hidden_states).contiguous() + query_states = self.query(hidden_states) query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) if past_key_value is not None: is_updated = past_key_value.is_updated.get(self.layer_idx) if is_cross_attention: # after the first generated id, we can subsequently re-use all key/value_states from cache - past_key_value = past_key_value.cross_attention_cache + curr_past_key_value = past_key_value.cross_attention_cache else: - past_key_value = past_key_value.self_attention_cache + curr_past_key_value = past_key_value.self_attention_cache - # get key/value states current_states = key_value_states if is_cross_attention else hidden_states if is_cross_attention and past_key_value and is_updated: # reuse k,v, cross_attentions - key_states = past_key_value.key_cache[self.layer_idx] - value_states = past_key_value.value_cache[self.layer_idx] + key_states = curr_past_key_value.key_cache[self.layer_idx] + value_states = curr_past_key_value.value_cache[self.layer_idx] else: - key_states = self.key(current_states).contiguous() - value_states = self.value(current_states).contiguous() + key_states = self.key(current_states) + value_states = self.value(current_states) 
key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + if past_key_value is not None: # save all key/value_states to cache to be re-used for fast auto-regressive generation cache_position = cache_position if not is_cross_attention else None - key_states, value_states = past_key_value.update( + key_states, value_states = curr_past_key_value.update( key_states, value_states, self.layer_idx, {"cache_position": cache_position} ) # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls if is_cross_attention: past_key_value.is_updated[self.layer_idx] = True - # compute scores + # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9 scores = torch.matmul(query_states, key_states.transpose(3, 2)) if position_bias is None: - real_seq_length = cache_position[-1] + 1 if query_length is None else query_length - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + key_length = key_states.shape[-2] + # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past) + real_seq_length = query_length if query_length is not None else cache_position[-1] + 1 if not self.has_relative_attention_bias: position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype + (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype ) if self.gradient_checkpointing and self.training: position_bias.requires_grad = True else: - position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device) + position_bias = self.compute_bias( + real_seq_length, key_length, device=scores.device, cache_position=cache_position + ) + position_bias = position_bias[:, :, -seq_length:, :] if mask is not None: 
- position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + causal_mask = mask[:, :, :, : key_states.shape[-2]] + position_bias = position_bias + causal_mask if self.pruned_heads: mask = torch.ones(position_bias.shape[1]) @@ -860,10 +869,9 @@ def forward( position_bias_masked = position_bias scores += position_bias_masked - # (batch_size, n_heads, seq_length, key_length) - attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) # (batch_size, n_heads, seq_length, key_length) + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) # Mask heads if we want to @@ -871,12 +879,12 @@ def forward( attn_weights = attn_weights * layer_head_mask attn_output = torch.matmul(attn_weights, value_states) - # (batch_size, seq_length, dim) - attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, -1, self.inner_dim) attn_output = self.output(attn_output) - outputs = (attn_output,) + (past_key_value,) + (position_bias,) + outputs = (attn_output, past_key_value, position_bias) if output_attentions: outputs = outputs + (attn_weights,) @@ -969,7 +977,10 @@ def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optiona layer_idx=layer_idx, ) - self.encoder_decoder_attention = Pix2StructTextLayerCrossAttention(config) + self.encoder_decoder_attention = Pix2StructTextLayerCrossAttention( + config, + layer_idx=layer_idx, + ) self.mlp = Pix2StructTextLayerFF(config) @@ -1019,7 +1030,6 @@ def forward( query_length=cache_position[-1] + 1, use_cache=use_cache, output_attentions=output_attentions, - cache_position=cache_position, ) hidden_states, past_key_value = cross_attention_outputs[:2] diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py 
b/tests/models/pix2struct/test_modeling_pix2struct.py index 2d762008cbb..18b79f3fbc9 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -419,6 +419,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Pix2StructModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (Pix2StructForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (Pix2StructForConditionalGeneration,) if is_torch_available() else {} pipeline_model_mapping = {"image-to-text": Pix2StructForConditionalGeneration} if is_torch_available() else {} fx_compatible = False test_head_masking = False @@ -445,6 +446,16 @@ def test_model(self): ), ) + def test_generative_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_generative_model_classes: + model = model_class(config).eval().to(torch_device) + + output = model.generate(**input_dict, use_cache=False, min_new_tokens=10, max_new_tokens=10) + output_use_cache = model.generate(**input_dict, use_cache=True, min_new_tokens=10, max_new_tokens=10) + + torch.testing.assert_close(output, output_use_cache) + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass From fc465bb196c3f014b1be43aa599a6183e660cccc Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 28 Oct 2024 11:59:46 +0100 Subject: [PATCH 35/99] pin `tensorflow_probability<0.22` in docker files (#34381) 0.21 Co-authored-by: ydshieh --- docker/transformers-all-latest-gpu/Dockerfile | 2 +- docker/transformers-tensorflow-gpu/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 93f9b6f6a17..7ad4e96d62c 100644 --- 
a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -26,7 +26,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers && # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future. # 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`. # Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions). -RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 tensorflow_text tensorflow_probability && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA +RUN python3 -m pip install --no-cache-dir -U tensorflow==2.13 protobuf==3.20.3 "tensorflow_text<2.16" "tensorflow_probability<0.22" && python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA RUN python3 -m pip uninstall -y flax jax diff --git a/docker/transformers-tensorflow-gpu/Dockerfile 
b/docker/transformers-tensorflow-gpu/Dockerfile index d765767780f..378491a6c60 100644 --- a/docker/transformers-tensorflow-gpu/Dockerfile +++ b/docker/transformers-tensorflow-gpu/Dockerfile @@ -18,7 +18,7 @@ RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' || VERSIO RUN python3 -m pip uninstall -y torch flax RUN python3 -m pip install -U "itsdangerous<2.1.0" -RUN python3 -m pip install --no-cache-dir -U tensorflow_probability +RUN python3 -m pip install --no-cache-dir -U "tensorflow_probability<0.22" # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. From 9360f1827d620c00d64755d40cd526dceabf5060 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 28 Oct 2024 12:01:05 +0100 Subject: [PATCH 36/99] Tiny update after #34383 (#34404) * update * update * update --------- Co-authored-by: ydshieh --- utils/check_bad_commit.py | 3 +++ utils/notification_service.py | 3 ++- utils/process_bad_commit_report.py | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/utils/check_bad_commit.py b/utils/check_bad_commit.py index adb25f11264..45b01537127 100644 --- a/utils/check_bad_commit.py +++ b/utils/check_bad_commit.py @@ -75,6 +75,9 @@ def find_bad_commit(target_test, start_commit, end_commit): `str`: The earliest commit at which `target_test` fails. 
""" + if start_commit == end_commit: + return start_commit + create_script(target_test=target_test) bash = f""" diff --git a/utils/notification_service.py b/utils/notification_service.py index 629b7933378..039ee8b29a3 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -547,7 +547,8 @@ def payload(self) -> str: items = re.findall(pattern, line) elif "tests/models/" in line: model = line.split("/")[2] - new_failed_tests[model] = {"single-gpu": [], "multi-gpu": []} + if model not in new_failed_tests: + new_failed_tests[model] = {"single-gpu": [], "multi-gpu": []} for url, device in items: new_failed_tests[model][f"{device}-gpu"].append(line) file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/new_model_failures.json") diff --git a/utils/process_bad_commit_report.py b/utils/process_bad_commit_report.py index 513dc8df3a3..19812ff21f7 100644 --- a/utils/process_bad_commit_report.py +++ b/utils/process_bad_commit_report.py @@ -64,6 +64,8 @@ for device, failed_tests in model_result.items(): failed_tests = [x for x in failed_tests if x["author"] == author or x["merged_by"] == author] model_result[device] = failed_tests + _data[model] = {k: v for k, v in model_result.items() if len(v) > 0} + new_data_full[author] = {k: v for k, v in _data.items() if len(v) > 0} # Upload to Hub and get the url with open("new_model_failures_with_bad_commit_grouped_by_authors.json", "w") as fp: From 92bcdff2ef0932cf6dec4c3538389d7ccfd92f59 Mon Sep 17 00:00:00 2001 From: Nischay Date: Mon, 28 Oct 2024 17:53:52 +0530 Subject: [PATCH 37/99] Fix batch size handling in prediction_loop for DataLoaderShard (#34343) * Fix batch size handling in prediction_loop for DataLoaderShard Updated the prediction_loop method in the Trainer class to correctly handle batch size when using DataLoaderShard. This ensures that the batch size is retrieved from total_batch_size for distributed training scenarios, preventing TypeError related to NoneType during evaluation. 
* Update src/transformers/trainer.py Co-authored-by: Zach Mueller * Applied the fix to remove unused imports --------- Co-authored-by: Zach Mueller --- src/transformers/trainer.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 1b13787007e..8fe25b74661 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -4714,7 +4714,17 @@ def prediction_loop( elif args.bf16_full_eval: model = model.to(dtype=torch.bfloat16, device=args.device) - batch_size = dataloader.batch_size + batch_size = ( + dataloader.total_batch_size + if getattr(dataloader, "_is_accelerate_prepared", False) + else dataloader.batch_size + ) + + if batch_size is None: + raise ValueError( + "Batch size cannot be None. Ensure the dataloader has a valid batch_size or total_batch_size." + ) + num_examples = self.num_examples(dataloader) logger.info(f"\n***** Running {description} *****") logger.info(f" Num examples = {num_examples}") From 8b3b9b48fcd6bc06bd9c576f1b09266d577db257 Mon Sep 17 00:00:00 2001 From: AbdelKarim ELJANDOUBI <78537694+eljandoubi@users.noreply.github.com> Date: Mon, 28 Oct 2024 13:50:16 +0100 Subject: [PATCH 38/99] exclude fsdp from delay_optimizer_creation (#34140) * exclude fsdp from delay_optimizer_creation * add test case for trainer: FSDP mode and fp8 as mixed precision * rearrange imports * ruff formatted * adapt _init_fsdp to fp8 * use _init_fsdp only when resume_from_checkpoint * In case of FDP, self.layer will be CheckpointWrapper which has no len() method * delete _init_fsdp * solve conflict * fix conflict * make fixup --- src/transformers/testing_utils.py | 8 ++++++++ src/transformers/trainer.py | 7 +++++-- tests/trainer/test_trainer_fsdp.py | 32 ++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 7bb2d5049dc..2781e9e102e 100644 --- 
a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -144,6 +144,7 @@ if is_accelerate_available(): from accelerate.state import AcceleratorState, PartialState + from accelerate.utils.imports import is_fp8_available if is_pytest_available(): @@ -1000,6 +1001,13 @@ def require_torch_fp16(test_case): )(test_case) +def require_fp8(test_case): + """Decorator marking a test that requires supports for fp8""" + return unittest.skipUnless(is_accelerate_available() and is_fp8_available(), "test requires fp8 support")( + test_case + ) + + def require_torch_bf16(test_case): """Decorator marking a test that requires a device that supports bf16""" return unittest.skipUnless( diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 8fe25b74661..64cb5c6bd4d 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2209,7 +2209,7 @@ def _inner_training_loop( else: debug_overflow = DebugUnderflowOverflow(self.model) # noqa - delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled + delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled # We need to reset the scheduler, as its parameters may be different on subsequent calls if self._created_lr_scheduler: @@ -2258,9 +2258,12 @@ def _inner_training_loop( # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX use_accelerator_prepare = True if model is self.model else False + # configure fsdp plugin for qlora if any + if use_accelerator_prepare: + self._fsdp_qlora_plugin_updates() + if delay_optimizer_creation: if use_accelerator_prepare: - self._fsdp_qlora_plugin_updates() self.model = self.accelerator.prepare(self.model) self.create_optimizer_and_scheduler(num_training_steps=max_steps) diff --git a/tests/trainer/test_trainer_fsdp.py b/tests/trainer/test_trainer_fsdp.py index 994a82a8db0..4bcf5de0452 100644 --- a/tests/trainer/test_trainer_fsdp.py +++ b/tests/trainer/test_trainer_fsdp.py @@ -20,6 +20,8 
@@ execute_subprocess_async, get_torch_dist_unique_port, require_accelerate, + require_fp8, + require_fsdp, require_torch_multi_gpu, ) @@ -64,6 +66,7 @@ def __getitem__(self, i: int) -> str: class TestFSDPTrainer(TestCasePlus): @require_accelerate @require_torch_multi_gpu + @require_fsdp def test_trainer(self): output_dir = self.get_auto_remove_tmp_dir() cmd = [ @@ -86,6 +89,35 @@ def test_trainer(self): # successful return here == success - any errors would have caused an error in the sub-call +class TestFSDPTrainerFP8(TestCasePlus): + @require_accelerate + @require_torch_multi_gpu + @require_fsdp + @require_fp8 + def test_trainer(self): + output_dir = self.get_auto_remove_tmp_dir() + cmd = [ + "accelerate", + "launch", + "--use_fsdp", + "--main_process_port", + f"{get_torch_dist_unique_port()}", + "--num_processes", + f"{torch.cuda.device_count()}", + "--mixed_precision", + "fp8", + "--fsdp_transformer_layer_cls_to_wrap", + "GPT2Block", + f"{self.test_file_dir}/test_trainer_fsdp.py", + "--output_dir", + f"{output_dir}", + "--report_to", + "none", + ] + execute_subprocess_async(cmd, env=self.get_env()) + # successful return here == success - any errors would have caused an error in the sub-call + + if __name__ == "__main__": parser = HfArgumentParser((Seq2SeqTrainingArguments,)) training_args = parser.parse_args_into_dataclasses()[0] From c1753436dbb8bcbcee183cdd6eba9f08a90d602a Mon Sep 17 00:00:00 2001 From: "Sean (Seok-Won) Yi" Date: Tue, 29 Oct 2024 00:02:22 +0900 Subject: [PATCH 39/99] New option called `"best"` for `args.save_strategy`. (#31817) * Add _determine_best_metric and new saving logic. 1. Logic to determine the best logic was separated out from `_save_checkpoint`. 2. In `_maybe_log_save_evaluate`, whether or not a new best metric was achieved is determined after each evaluation, and if the save strategy is "best' then the TrainerControl is updated accordingly. * Added SaveStrategy. Same as IntervalStrategy, but with a new attribute called BEST. 
* IntervalStrategy -> SaveStrategy * IntervalStrategy -> SaveStrategy for save_strat. * Interval -> Save in docstring. * Updated docstring for save_strategy. * Added SaveStrategy and made according changes. `save_strategy` previously followed `IntervalStrategy` but now follows `SaveStrategy`. Changes were made accordingly to the code and the docstring. * Changes from `make fixup`. * Removed redundant metrics argument. * Added new test_save_best_checkpoint test. 1. Checks for both cases where `metric_for_best_model` is explicitly provided and when it's not provided. 2. The first case should have two checkpoints saved, whereas the second should have three saved. * Changed should_training_end saving logic. The Trainer saves a checkpoint at the end of training by default as long as `save_strategy != SaveStrategy.NO`. This condition was modified to include `SaveStrategy.BEST` because it would be counterintuitive that we'd only want the best checkpoint to be saved but the last one is as well. * `args.metric_for_best_model` default to loss. * Undo metric_for_best_model update. * Remove checking metric_for_best_model. * Added test cases for loss and no metric. * Added error for metric and changed default best_metric. * Removed unused import. * `new_best_metric` -> `is_new_best_metric` Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Applied `is_new_best_metric` to all. Changes were made for consistency and also to fix a potential bug.
--------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: Zach Mueller --- src/transformers/trainer.py | 84 ++++++++++++++++++---------- src/transformers/trainer_callback.py | 8 +-- src/transformers/trainer_utils.py | 7 +++ src/transformers/training_args.py | 14 +++-- src/transformers/training_args_tf.py | 2 +- tests/trainer/test_trainer.py | 83 +++++++++++++++++++++++++++ 6 files changed, 158 insertions(+), 40 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 64cb5c6bd4d..4315e54a42f 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -117,9 +117,9 @@ EvalPrediction, HPSearchBackend, HubStrategy, - IntervalStrategy, PredictionOutput, RemoveColumnsCollator, + SaveStrategy, TrainerMemoryTracker, TrainOutput, check_target_module_exists, @@ -419,6 +419,12 @@ def __init__( raise ValueError( f"You have set `args.eval_strategy` to {args.eval_strategy} but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. " ) + if args.save_strategy == SaveStrategy.BEST or args.load_best_model_at_end: + if args.metric_for_best_model is None: + raise ValueError( + "`args.metric_for_best_model` must be provided when using 'best' save_strategy or if `args.load_best_model_at_end` is set to `True`." 
+ ) + self.args = args self.compute_loss_func = compute_loss_func # Seed must be set before instantiating the model when using model @@ -2998,9 +3004,13 @@ def _maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, igno metrics = None if self.control.should_evaluate: metrics = self._evaluate(trial, ignore_keys_for_eval) + is_new_best_metric = self._determine_best_metric(metrics=metrics, trial=trial) + + if self.args.save_strategy == SaveStrategy.BEST: + self.control.should_save = is_new_best_metric if self.control.should_save: - self._save_checkpoint(model, trial, metrics=metrics) + self._save_checkpoint(model, trial) self.control = self.callback_handler.on_save(self.args, self.state, self.control) def _load_rng_state(self, checkpoint): @@ -3077,7 +3087,48 @@ def _load_rng_state(self, checkpoint): "\nThis won't yield the same results as if the training had not been interrupted." ) - def _save_checkpoint(self, model, trial, metrics=None): + def _determine_best_metric(self, metrics, trial): + """ + Determine if the model should be saved based on the evaluation metrics. + If args.metric_for_best_model is not set, the loss is used. + + Returns: + bool: True if a new best metric was found, else False + """ + is_new_best_metric = False + + if self.args.metric_for_best_model is not None: + metric_to_check = self.args.metric_for_best_model + + if not metric_to_check.startswith("eval_"): + metric_to_check = f"eval_{metric_to_check}" + + try: + metric_value = metrics[metric_to_check] + except KeyError as exc: + raise KeyError( + f"The `metric_for_best_model` training argument is set to '{metric_to_check}', which is not found in the evaluation metrics. " + f"The available evaluation metrics are: {list(metrics.keys())}. Consider changing the `metric_for_best_model` via the TrainingArguments." 
+ ) from exc + + operator = np.greater if self.args.greater_is_better else np.less + + if self.state.best_metric is None: + self.state.best_metric = float("-inf") if self.args.greater_is_better else float("inf") + + if operator(metric_value, self.state.best_metric): + run_dir = self._get_output_dir(trial=trial) + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + output_dir = os.path.join(run_dir, checkpoint_folder) + + self.state.best_metric = metric_value + self.state.best_model_checkpoint = output_dir + + is_new_best_metric = True + + return is_new_best_metric + + def _save_checkpoint(self, model, trial): # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we # want to save except FullyShardedDDP. # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" @@ -3098,31 +3149,6 @@ def _save_checkpoint(self, model, trial, metrics=None): # Save RNG state self._save_rng_state(output_dir) - # Determine the new best metric / best model checkpoint - if metrics is not None and self.args.metric_for_best_model is not None: - metric_to_check = self.args.metric_for_best_model - if not metric_to_check.startswith("eval_"): - metric_to_check = f"eval_{metric_to_check}" - try: - metric_value = metrics[metric_to_check] - except KeyError as exc: - raise KeyError( - f"The `metric_for_best_model` training argument is set to '{metric_to_check}', " - f"which is not found in the evaluation metrics. " - f"The available evaluation metrics are: {list(metrics.keys())}. " - f"Please ensure that the `compute_metrics` function returns a dictionary that includes '{metric_to_check}' or " - f"consider changing the `metric_for_best_model` via the TrainingArguments." 
- ) from exc - - operator = np.greater if self.args.greater_is_better else np.less - if ( - self.state.best_metric is None - or self.state.best_model_checkpoint is None - or operator(metric_value, self.state.best_metric) - ): - self.state.best_metric = metric_value - self.state.best_model_checkpoint = output_dir - # Save the Trainer state if self.args.should_save: # Update `ExportableState` callbacks and `TrainerControl` state to where we are currently @@ -4543,7 +4569,7 @@ def _push_from_checkpoint(self, checkpoint_folder): # Same for the training arguments torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) - if self.args.save_strategy == IntervalStrategy.STEPS: + if self.args.save_strategy == SaveStrategy.STEPS: commit_message = f"Training in progress, step {self.state.global_step}" else: commit_message = f"Training in progress, epoch {int(self.state.epoch)}" diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index 405874acf8f..ce9f2a26732 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -24,7 +24,7 @@ import numpy as np from tqdm.auto import tqdm -from .trainer_utils import IntervalStrategy, has_length +from .trainer_utils import IntervalStrategy, SaveStrategy, has_length from .training_args import TrainingArguments from .utils import logging @@ -555,7 +555,7 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra # Save if ( - args.save_strategy == IntervalStrategy.STEPS + args.save_strategy == SaveStrategy.STEPS and state.save_steps > 0 and state.global_step % state.save_steps == 0 ): @@ -565,7 +565,7 @@ def on_step_end(self, args: TrainingArguments, state: TrainerState, control: Tra if state.global_step >= state.max_steps: control.should_training_stop = True # Save the model at the end if we have a save strategy - if args.save_strategy != IntervalStrategy.NO: + if args.save_strategy not in [SaveStrategy.NO, SaveStrategy.BEST]: 
control.should_save = True return control @@ -580,7 +580,7 @@ def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: Tr control.should_evaluate = True # Save - if args.save_strategy == IntervalStrategy.EPOCH: + if args.save_strategy == SaveStrategy.EPOCH: control.should_save = True return control diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 02c298cf7d2..42088cd7306 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -227,6 +227,13 @@ class IntervalStrategy(ExplicitEnum): EPOCH = "epoch" +class SaveStrategy(ExplicitEnum): + NO = "no" + STEPS = "steps" + EPOCH = "epoch" + BEST = "best" + + class EvaluationStrategy(ExplicitEnum): NO = "no" STEPS = "steps" diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 485610dd9ba..c98e8bc41b9 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -33,6 +33,7 @@ FSDPOption, HubStrategy, IntervalStrategy, + SaveStrategy, SchedulerType, ) from .utils import ( @@ -349,12 +350,13 @@ class TrainingArguments: - save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + save_strategy (`str` or [`~trainer_utils.SaveStrategy`], *optional*, defaults to `"steps"`): The checkpoint save strategy to adopt during training. Possible values are: - `"no"`: No save is done during training. - `"epoch"`: Save is done at the end of each epoch. - `"steps"`: Save is done every `save_steps`. + - `"best"`: Save is done whenever a new `best_metric` is achieved. If `"epoch"` or `"steps"` is chosen, saving will also be performed at the very end of training, always. 
@@ -962,7 +964,7 @@ class TrainingArguments: }, ) logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."}) - save_strategy: Union[IntervalStrategy, str] = field( + save_strategy: Union[SaveStrategy, str] = field( default="steps", metadata={"help": "The checkpoint save strategy to use."}, ) @@ -1580,7 +1582,7 @@ def __post_init__(self): self.eval_strategy = IntervalStrategy(self.eval_strategy) self.logging_strategy = IntervalStrategy(self.logging_strategy) - self.save_strategy = IntervalStrategy(self.save_strategy) + self.save_strategy = SaveStrategy(self.save_strategy) self.hub_strategy = HubStrategy(self.hub_strategy) self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type) @@ -1616,7 +1618,7 @@ def __post_init__(self): if self.eval_steps != int(self.eval_steps): raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}") self.eval_steps = int(self.eval_steps) - if self.save_strategy == IntervalStrategy.STEPS and self.save_steps > 1: + if self.save_strategy == SaveStrategy.STEPS and self.save_steps > 1: if self.save_steps != int(self.save_steps): raise ValueError(f"--save_steps must be an integer if bigger than 1: {self.save_steps}") self.save_steps = int(self.save_steps) @@ -2750,8 +2752,8 @@ def set_save( 100 ``` """ - self.save_strategy = IntervalStrategy(strategy) - if self.save_strategy == IntervalStrategy.STEPS and steps == 0: + self.save_strategy = SaveStrategy(strategy) + if self.save_strategy == SaveStrategy.STEPS and steps == 0: raise ValueError("Setting `strategy` as 'steps' requires a positive value for `steps`.") self.save_steps = steps self.save_total_limit = total_limit diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 9df53c3f1d6..3716a78879d 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -114,7 +114,7 @@ class TFTrainingArguments(TrainingArguments): Whether to log 
and evaluate the first `global_step` or not. logging_steps (`int`, *optional*, defaults to 500): Number of update steps between two logs if `logging_strategy="steps"`. - save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + save_strategy (`str` or [`~trainer_utils.SaveStrategy`], *optional*, defaults to `"steps"`): The checkpoint save strategy to adopt during training. Possible values are: - `"no"`: No save is done during training. diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 5c03355785d..b6fe807fa49 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -4041,6 +4041,89 @@ def test_trainer_saves_processor(self): reloaded_tokenizer(test_sentence, padding="max_length").input_ids, ) + def test_save_best_checkpoint(self): + freq = int(64 / self.batch_size) + total = int(self.n_epochs * 64 / self.batch_size) + + # Case 1: args.metric_for_best_model == "accuracy". + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + a=1.5, + b=2.5, + output_dir=tmpdir, + learning_rate=0.1, + eval_strategy="epoch", + save_strategy="best", + metric_for_best_model="accuracy", + compute_metrics=AlmostAccuracy(), + ) + self.assertTrue(trainer.args.metric_for_best_model == "accuracy") + + with patch.object( + trainer, + "_evaluate", + side_effect=[ + {"eval_loss": 0.03, "eval_accuracy": 0.60, "epoch": 1.0}, + {"eval_loss": 0.02, "eval_accuracy": 0.65, "epoch": 2.0}, + {"eval_loss": 0.01, "eval_accuracy": 0.64, "epoch": 3.0}, + ], + ): + trainer.train() + + self.assertEqual(len(os.listdir(tmpdir)), 2) + self.check_saved_checkpoints( + output_dir=tmpdir, + freq=freq, + total=total, + ) + + # Case 2: args.metric_for_best_model == "loss". 
+ with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + a=1.5, + b=2.5, + output_dir=tmpdir, + learning_rate=0.1, + eval_strategy="epoch", + save_strategy="best", + metric_for_best_model="loss", + compute_metrics=AlmostAccuracy(), + ) + self.assertTrue(trainer.args.metric_for_best_model == "loss") + + with patch.object( + trainer, + "_evaluate", + side_effect=[ + {"eval_loss": 0.03, "eval_accuracy": 0.60, "epoch": 1.0}, + {"eval_loss": 0.02, "eval_accuracy": 0.65, "epoch": 2.0}, + {"eval_loss": 0.03, "eval_accuracy": 0.66, "epoch": 3.0}, + ], + ): + trainer.train() + + self.assertEqual(len(os.listdir(tmpdir)), 2) + self.check_saved_checkpoints( + output_dir=tmpdir, + freq=freq, + total=total, + ) + + # Case 3: Metric name not provided; throw error. + with tempfile.TemporaryDirectory() as tmpdir: + with self.assertRaises(ValueError) as context: + trainer = get_regression_trainer( + a=1.5, + b=2.5, + output_dir=tmpdir, + learning_rate=0.1, + eval_strategy="epoch", + save_strategy="best", + compute_metrics=AlmostAccuracy(), + ) + + self.assertIn("`args.metric_for_best_model` must be provided", str(context.exception)) + @require_torch @is_staging_test From fc1ae7f30f1d16c7652c28dd8d91c5d8a8ed2f15 Mon Sep 17 00:00:00 2001 From: Vijay Date: Mon, 28 Oct 2024 21:44:07 +0530 Subject: [PATCH 40/99] [docs] update input documentation for MAMBA2 and MISTRAL models to include cache_position and attention_mask details (#34322) * [docs] update input documentation for MAMBA2 and MISTRAL models to include cache_position and attention_mask details * [docs] correct input documentation for MISTRAL model to reference `input_ids` instead of `decoder_input_ids` * [docs] clarify cache_position description in MISTRAL model documentation --- src/transformers/models/mamba2/modeling_mamba2.py | 10 ++++++++++ src/transformers/models/mistral/modeling_mistral.py | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git 
a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py index 110ae09a388..c312b9b9435 100644 --- a/src/transformers/models/mamba2/modeling_mamba2.py +++ b/src/transformers/models/mamba2/modeling_mamba2.py @@ -805,6 +805,16 @@ class Mamba2CausalLMOutput(ModelOutput): more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + The position of the current input in the cache. This is used to ensure that the cache is correctly updated. + If `cache_params` is passed, `cache_position` should also be passed. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) """ diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 321d3dc0daf..3b0fb75a4cb 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -619,7 +619,7 @@ def _init_weights(self, module): Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see `past_key_values`). If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] @@ -666,6 +666,10 @@ def _init_weights(self, module): more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices indicating the position of the input sequence tokens in the sequence. Unlike `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. """ From 1f7539c829531810e96501156598ffeaee8cd7e7 Mon Sep 17 00:00:00 2001 From: wony617 <49024958+Jwaminju@users.noreply.github.com> Date: Tue, 29 Oct 2024 02:46:49 +0900 Subject: [PATCH 41/99] =?UTF-8?q?=F0=9F=8C=90=20[i18n-KO]=20Translated=20`?= =?UTF-8?q?model=5Fdoc/barthez.md`=20to=20Korean=20(#33980)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: ko: model_doc/barthez.md * feat: nmt draft --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/ko/_toctree.yml | 4 +- docs/source/ko/model_doc/barthez.md | 60 +++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) create mode 100644 docs/source/ko/model_doc/barthez.md diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 27102f123dd..51d54b697b2 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -322,8 +322,8 @@ title: (번역중) ALBERT - local: model_doc/bart title: BART - - local: in_translation - title: (번역중) BARThez + - local: model_doc/barthez + title: BARThez - local: model_doc/bartpho title: BARTpho - local: in_translation diff --git a/docs/source/ko/model_doc/barthez.md b/docs/source/ko/model_doc/barthez.md new file mode 100644 index 00000000000..131db38856c --- /dev/null +++ b/docs/source/ko/model_doc/barthez.md @@ -0,0 +1,60 @@ + + +# BARThez [[barthez]] + +## 개요 [[overview]] + +BARThez 모델은 2020년 10월 23일, Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis에 의해 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321)에서 제안되었습니다. 
+ +이 논문의 초록: + + +*자기지도 학습에 의해 가능해진 귀납적 전이 학습은 자연어 처리(NLP) 분야 전반에 걸쳐 큰 반향을 일으켰으며, +BERT와 BART와 같은 모델들은 수많은 자연어 이해 작업에서 새로운 최첨단 성과를 기록했습니다. 일부 주목할 만한 예외가 있지만, +대부분의 사용 가능한 모델과 연구는 영어에 집중되어 있었습니다. 본 연구에서는 BARThez를 소개합니다. +이는 (우리가 아는 한) 프랑스어를 위한 첫 번째 BART 모델입니다. +BARThez는 과거 연구에서 얻은 매우 큰 프랑스어 단일 언어 말뭉치로 사전훈련되었으며, +BART의 변형 방식에 맞게 조정되었습니다. +CamemBERT 및 FlauBERT와 같은 기존의 BERT 기반 프랑스어 모델과 달리, BARThez는 생성 작업에 특히 적합합니다. +이는 인코더뿐만 아니라 디코더도 사전훈련되었기 때문입니다. +우리는 FLUE 벤치마크에서의 판별 작업 외에도 이 논문과 함께 공개하는 새로운 요약 데이터셋인 OrangeSum에서 BARThez를 평가했습니다. +또한 이미 사전훈련된 다국어 BART의 사전훈련을 BARThez의 말뭉치로 계속 진행하였으며, +결과적으로 얻어진 모델인 mBARTHez가 기본 BARThez보다 유의미한 성능 향상을 보였고, +CamemBERT 및 FlauBERT와 동등하거나 이를 능가함을 보였습니다.* + +이 모델은 [moussakam](https://huggingface.co/moussakam)이 기여했습니다. 저자의 코드는 [여기](https://github.com/moussaKam/BARThez)에서 찾을 수 있습니다. + + + +BARThez 구현은 🤗 BART와 동일하나, 토큰화에서 차이가 있습니다. 구성 클래스와 그 매개변수에 대한 정보는 [BART 문서](bart)를 참조하십시오. +BARThez 전용 토크나이저는 아래에 문서화되어 있습니다. + + + +## 리소스 [[resources]] + +- BARThez는 🤗 BART와 유사한 방식으로 시퀀스-투-시퀀스 작업에 맞춰 미세 조정될 수 있습니다. 다음을 확인하세요: + [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md). + + +## BarthezTokenizer [[bartheztokenizer]] + +[[autodoc]] BarthezTokenizer + +## BarthezTokenizerFast [[bartheztokenizerfast]] + +[[autodoc]] BarthezTokenizerFast From 084e946cfdf4ecd37e8004db68018c042630c18e Mon Sep 17 00:00:00 2001 From: Shubham S Jagtap <63872951+ShubhamJagtap2000@users.noreply.github.com> Date: Mon, 28 Oct 2024 23:18:18 +0530 Subject: [PATCH 42/99] Apply linting to the important code blocks to make it readable (#34449) Enhance user experience using py-linting --- docs/README.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/README.md b/docs/README.md index 7dbcefc0483..bb54d700413 100644 --- a/docs/README.md +++ b/docs/README.md @@ -276,14 +276,14 @@ building the return. 
Here's an example of a single value return: -``` +```python Returns: `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token. ``` Here's an example of a tuple return, comprising several objects: -``` +```python Returns: `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs: - ** loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` -- @@ -322,10 +322,9 @@ includes an example of how to transcribe speech to text in the The syntax for Example docstrings can look as follows: -``` +```python Example: - ```python >>> from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC >>> from datasets import load_dataset >>> import torch @@ -347,7 +346,6 @@ The syntax for Example docstrings can look as follows: >>> transcription = processor.batch_decode(predicted_ids) >>> transcription[0] 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL' - ``` ``` The docstring should give a minimal, clear example of how the respective model From a17f287ac039f92835b5cd9bd8ee28b584c9f65e Mon Sep 17 00:00:00 2001 From: Ahmed Almaghz <53489256+AhmedAlmaghz@users.noreply.github.com> Date: Mon, 28 Oct 2024 20:54:37 +0300 Subject: [PATCH 43/99] [i18n-ar] Translated file : `docs/source/ar/fast_tokenizers.md` into Arabic (#33034) * Add docs/source/ar/fast_tokenizers.md to Add_docs_source_ar_fast_tokenizers.md * Update _toctree.yml * Update _toctree.yml * Update docs/source/ar/_toctree.yml Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed 
<554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/fast_tokenizers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> --------- Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> --- docs/source/ar/_toctree.yml | 8 ++--- docs/source/ar/fast_tokenizers.md | 51 +++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 docs/source/ar/fast_tokenizers.md diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml index 6f7899b53b8..bd45925c64c 100644 --- a/docs/source/ar/_toctree.yml +++ b/docs/source/ar/_toctree.yml @@ -108,9 +108,9 @@ # title: دليل إرشادي لمحفزات النماذج اللغوية الكبيرة # title: الإرشاد # title: أدلة المهام -# - sections: -# - local: fast_tokenizers -# title: استخدم برامج التجزئة السريعة من 🤗 Tokenizers +- sections: + - local: fast_tokenizers + title: استخدم مجزئيات النصوص السريعة من 🤗 Tokenizers # - local: multilingual # title: تشغيل الاستنتاج باستخدام نماذج متعددة اللغات # - local: create_a_model @@ -139,7 +139,7 @@ # title: استكشاف الأخطاء وإصلاحها # - local: gguf # title: التوافق مع ملفات GGUF -# title: أدلة المطورين + title: أدلة المطورين # - sections: # - local: quantization/overview # title: نظرة عامة diff --git 
a/docs/source/ar/fast_tokenizers.md b/docs/source/ar/fast_tokenizers.md new file mode 100644 index 00000000000..539712969e8 --- /dev/null +++ b/docs/source/ar/fast_tokenizers.md @@ -0,0 +1,51 @@ +# استخدام مجزئيات النصوص من 🤗 Tokenizers + +يعتمد [`PreTrainedTokenizerFast`] على مكتبة [🤗 Tokenizers](https://huggingface.co/docs/tokenizers). يمكن تحميل المجزئات اللغوية التي تم الحصول عليها من مكتبة 🤗 Tokenizers ببساطة شديدة في 🤗 Transformers. + +قبل الدخول في التفاصيل، دعونا نبدأ أولاً بإنشاء مُجزىء لغوي تجريبي في بضعة أسطر: + +```python +>>> from tokenizers import Tokenizer +>>> from tokenizers.models import BPE +>>> from tokenizers.trainers import BpeTrainer +>>> from tokenizers.pre_tokenizers import Whitespace + +>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]")) +>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]) + +>>> tokenizer.pre_tokenizer = Whitespace() +>>> files = [...] +>>> tokenizer.train(files, trainer) +``` + +الآن لدينا مُجزىء لغوي مدرب على الملفات التي حددناها. يمكننا إما الاستمرار في استخدامه في وقت التشغيل هذا، أو حفظه في ملف JSON لإعادة استخدامه لاحقًا. + +## تحميل مُجزئ النّصوص مُباشرةً + +دعونا نرى كيف يمكننا الاستفادة من كائن (مُجزئ النصوص) في مكتبة 🤗 Transformers. تتيح فئة [`PreTrainedTokenizerFast`] إنشاء *tokenizer* بسهولة، من خلال قبول كائن *مُجزئ النصوص* مُهيّأ مُسبقًا كمعامل: + +```python +>>> from transformers import PreTrainedTokenizerFast + +>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer) +``` + +يمكن الآن استخدام هذا الكائن مع جميع الطرق المُشتركة بين مُجزّئي النّصوص لـ 🤗 Transformers! انتقل إلى [صفحة مُجزّئ النّصوص](main_classes/tokenizer) لمزيد من المعلومات.
+ +## التحميل من ملف JSON + +لتحميل مُجزّئ النص من ملف JSON، دعونا نبدأ أولاً بحفظ مُجزّئ النّصوص: + +```python +>>> tokenizer.save("tokenizer.json") +``` + +يمكن تمرير المسار الذي حفظنا فيه هذا الملف إلى طريقة تهيئة [`PreTrainedTokenizerFast`] باستخدام المُعامل `tokenizer_file`: + +```python +>>> from transformers import PreTrainedTokenizerFast + +>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json") +``` + +يمكن الآن استخدام هذا الكائن مع جميع الطرق المُشتركة بين مُجزّئي النّصوص لـ 🤗 Transformers! انتقل إلى [صفحة مُجزّئ النص](main_classes/tokenizer) لمزيد من المعلومات. \ No newline at end of file From d21dbd1520937c993de1409215b1418bd6be74a1 Mon Sep 17 00:00:00 2001 From: kang sheng Date: Tue, 29 Oct 2024 01:59:38 +0800 Subject: [PATCH 44/99] enable average tokens across devices (#34373) * enable average tokens across devices * reduce earlier in case model needs it * simplify if statement * reformat code to make ruff happy * add doc for argument: average_tokens_across_devices * cannot find world size when pytorch is unavailable * format code --------- Co-authored-by: Zach Mueller Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/trainer.py | 10 +++++++++- src/transformers/training_args.py | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 4315e54a42f..9176bd72a55 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -3631,7 +3631,12 @@ def training_step( with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: - loss *= self.args.gradient_accumulation_steps + if num_items_in_batch is not None: + if self.compute_loss_func or self.model_accepts_loss_kwargs: + loss *= self.args.gradient_accumulation_steps + # Average tokens across devices is orthogonal to gradient accumulation + if self.args.average_tokens_across_devices: + loss *=
self.args.world_size self.accelerator.backward(loss, **kwargs) return loss.detach() / self.args.gradient_accumulation_steps @@ -3646,6 +3651,9 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N labels = inputs.pop("labels") else: labels = None + if self.args.average_tokens_across_devices and num_items_in_batch is not None: + num_items_in_batch_tensor = torch.tensor(num_items_in_batch, device=self.args.device) + num_items_in_batch = int(self.accelerator.gather(num_items_in_batch_tensor).sum().cpu()) if self.model_accepts_loss_kwargs: loss_kwargs = {} if num_items_in_batch is not None: diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index c98e8bc41b9..3e5c6cc2f37 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1532,6 +1532,15 @@ class TrainingArguments: }, ) + average_tokens_across_devices: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether or not to average tokens across devices. If enabled, will use all_reduce to " + "synchronize num_tokens_in_batch for precise loss calculation. Reference: " + "https://github.com/huggingface/transformers/issues/34242" + }, + ) + def __post_init__(self): # Parse in args that could be `dict` sent in from the CLI as a string for field in _VALID_DICT_FIELDS: @@ -1765,6 +1774,19 @@ def __post_init__(self): if self.framework == "pt" and is_torch_available(): self.device + # Disable average tokens when using single device + if self.average_tokens_across_devices: + try: + if self.world_size == 1: + logger.warning( + "average_tokens_across_devices is set to True but it is invalid when world size is" + "1. Turn it to False automatically." + ) + self.average_tokens_across_devices = False + except ImportError as e: + logger.warning(f"Can not specify world size due to {e}. 
Turn average_tokens_across_devices to False.") + self.average_tokens_across_devices = False + if self.torchdynamo is not None: warnings.warn( "`torchdynamo` is deprecated and will be removed in version 5 of 🤗 Transformers. Use" From 6cc4a67b3d22445cd17e26922ba4435a5e97f759 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Mon, 28 Oct 2024 19:33:17 +0100 Subject: [PATCH 45/99] feat: run benchmarks on A100 (#34287) --- .github/workflows/benchmark.yml | 9 +- benchmark/grafana_dashboard.json | 1593 ++++++++++++++++-------------- benchmark/llama.py | 4 + 3 files changed, 885 insertions(+), 721 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 79f0652e192..a65b8cafe56 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -16,8 +16,11 @@ env: jobs: benchmark: name: Benchmark + strategy: + matrix: + group: [aws-g5-4xlarge-cache, aws-p4d-24xlarge-plus] runs-on: - group: aws-g5-4xlarge-cache + group: ${{ matrix.group }} if: | (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark') )|| (github.event_name == 'push' && github.ref == 'refs/heads/main') @@ -60,9 +63,13 @@ jobs: commit_id=$GITHUB_SHA fi commit_msg=$(git show -s --format=%s | cut -c1-70) + df -h python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg" env: HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + # Enable this to see debug logs + # HF_HUB_VERBOSITY: debug + # TRANSFORMERS_VERBOSITY: debug PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }} PGUSER: transformers_benchmarks PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }} diff --git a/benchmark/grafana_dashboard.json b/benchmark/grafana_dashboard.json index 2375663ffbc..3d579f7b368 100644 --- a/benchmark/grafana_dashboard.json +++ b/benchmark/grafana_dashboard.json @@ -39,7 +39,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" 
+ "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -77,7 +77,7 @@ "properties": [ { "id": "custom.width", - "value": 364 + "value": 196 } ] }, @@ -101,7 +101,7 @@ "properties": [ { "id": "custom.width", - "value": 708 + "value": 581 } ] }, @@ -113,7 +113,7 @@ "properties": [ { "id": "custom.width", - "value": 388 + "value": 379 } ] } @@ -148,7 +148,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name FROM benchmarks WHERE branch = '${branch}';", + "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name, created_at AS date FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -232,7 +232,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -312,7 +312,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -334,6 +334,19 @@ } ], "title": "First eager forward pass", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -341,7 +354,7 @@ 
"datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -424,7 +437,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -446,6 +459,19 @@ } ], "title": "Second eager forward pass", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -466,7 +492,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -545,7 +571,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = 
'${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -567,6 +593,19 @@ } ], "title": "Time to first token", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -574,7 +613,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -653,7 +692,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -675,6 +714,19 @@ } ], "title": "Time to second token", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -682,7 +734,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -761,7 +813,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON 
b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -783,6 +835,19 @@ } ], "title": "Time to third token", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -790,7 +855,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -869,7 +934,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -891,6 +956,19 @@ } ], "title": "Time to subsequent next tokens mean", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -911,7 +989,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - 
"uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -990,7 +1068,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}'", + "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1012,6 +1090,19 @@ } ], "title": "First compile generate", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -1019,7 +1110,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -1098,7 +1189,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}';", + "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY 
b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1120,6 +1211,19 @@ } ], "title": "Second compile generate", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -1127,7 +1231,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -1206,7 +1310,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}';", + "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1228,6 +1332,19 @@ } ], "title": "Third compile generate", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, @@ -1235,7 +1352,7 @@ "datasource": { "default": true, "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "fieldConfig": { "defaults": { @@ -1314,7 +1431,7 @@ "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN 
model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}';", + "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1336,11 +1453,24 @@ } ], "title": "Fourth compile generate", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], "transparent": true, "type": "barchart" }, { - "collapsed": false, + "collapsed": true, "gridPos": { "h": 1, "w": 24, @@ -1348,751 +1478,753 @@ "y": 64 }, "id": 15, - "panels": [], - "title": "Usage metrics", - "type": "row" - }, - { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "panels": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": 60000, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 
0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 65 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 65 - }, - "id": 1, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n d.cpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}';", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": 
"functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "CPU Utilization", + "transparent": true, + "type": "timeseries" }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, - "editorMode": "code", - "format": "table", - "rawQuery": true, - "rawSql": "SELECT\n d.cpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}'", - "refId": "A", - "sql": { - "columns": [ - { - "parameters": [ + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "name": "cpu_util", - "type": "functionParameter" + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 } - ], - "type": "function" + ] }, - { - "parameters": [ + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 65 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n b.commit_id,\n d.gpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}';", + "refId": "A", + "sql": { + "columns": [ { - "name": "mem_megabytes", - "type": "functionParameter" + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" } ], - "type": "function" - }, - { - "parameters": [ + "groupBy": [ { - "name": "gpu_util", - "type": "functionParameter" 
+ "property": { + "type": "string" + }, + "type": "groupBy" } ], - "type": "function" + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" }, - { - "parameters": [ + "table": "measurements" + } + ], + "title": "GPU Utilization", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "name": "gpu_mem_megabytes", - "type": "functionParameter" + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 } - ], - "type": "function" + ] }, - { - "parameters": [ + "unit": "decmbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 74 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT d.mem_megabytes, d.time FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}';", + "refId": "A", + "sql": { + "columns": [ { - "name": "\"time\"", - "type": "functionParameter" + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" } ], - "type": "function" - } - ], - "groupBy": [ - { - "property": { - "type": "string" - }, - "type": "groupBy" - } - ], - "limit": 50, - "whereJsonTree": { - "children1": [ - { - "id": "baa888b8-89ab-4cde-b012-31922f8671e9", - "properties": { - "field": "commit_id", - "fieldSrc": "field", - "operator": "equal", - "value": [ - "${commit}" - ], - "valueError": [ - null - ], - "valueSrc": [ - "value" - ], - "valueType": [ - "text" - ] - }, - "type": "rule" - } - ], - "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", - "type": "group" - }, - "whereString": "commit_id = '${commit}'" - }, - "table": "measurements" - } - ], - "title": "CPU Utilization", - "transparent": true, - "type": "timeseries" - }, - { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": 
{ - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": 60000, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 65 - }, - "id": 4, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" - }, - "editorMode": "code", - "format": "table", - "rawQuery": true, - "rawSql": "SELECT\n b.commit_id,\n d.gpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}'", - "refId": "A", - "sql": { - "columns": [ - { - "parameters": [ - { - "name": "cpu_util", - "type": "functionParameter" - } - ], - "type": "function" - }, - { - "parameters": [ + "groupBy": [ { - "name": "mem_megabytes", - "type": "functionParameter" + "property": { + "type": "string" + }, + "type": "groupBy" } ], - "type": "function" - }, - { - "parameters": [ - { - "name": "gpu_util", - "type": "functionParameter" - } - ], - "type": "function" - }, - { - "parameters": [ - { - "name": "gpu_mem_megabytes", - "type": "functionParameter" - } - ], - "type": 
"function" - }, - { - "parameters": [ - { - "name": "\"time\"", - "type": "functionParameter" - } - ], - "type": "function" - } - ], - "groupBy": [ - { - "property": { - "type": "string" + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" }, - "type": "groupBy" - } - ], - "limit": 50, - "whereJsonTree": { - "children1": [ - { - "id": "baa888b8-89ab-4cde-b012-31922f8671e9", - "properties": { - "field": "commit_id", - "fieldSrc": "field", - "operator": "equal", - "value": [ - "${commit}" - ], - "valueError": [ - null - ], - "valueSrc": [ - "value" - ], - "valueType": [ - "text" - ] - }, - "type": "rule" - } - ], - "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", - "type": "group" - }, - "whereString": "commit_id = '${commit}'" - }, - "table": "measurements" - } - ], - "title": "GPU Utilization", - "transparent": true, - "type": "timeseries" - }, - { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": 60000, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - 
"thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "whereString": "commit_id = '${commit}'" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decmbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 74 - }, - "id": 2, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "table": "measurements" + } + ], + "title": "Memory usage", + "transparent": true, + "type": "timeseries" }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, - "editorMode": "code", - "format": "table", - "rawQuery": true, - "rawSql": "SELECT d.mem_megabytes, d.time FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}'", - "refId": "A", - "sql": { - "columns": [ - { - "parameters": [ - { - "name": "cpu_util", - "type": "functionParameter" - } - ], - "type": "function" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - { - "parameters": [ - { - "name": "mem_megabytes", - "type": "functionParameter" - } - ], - "type": "function" + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, - { 
- "parameters": [ + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "name": "gpu_util", - "type": "functionParameter" - } - ], - "type": "function" - }, - { - "parameters": [ + "color": "green", + "value": null + }, { - "name": "gpu_mem_megabytes", - "type": "functionParameter" + "color": "red", + "value": 80 } - ], - "type": "function" + ] }, - { - "parameters": [ - { - "name": "\"time\"", - "type": "functionParameter" - } - ], - "type": "function" - } - ], - "groupBy": [ - { - "property": { - "type": "string" - }, - "type": "groupBy" - } - ], - "limit": 50, - "whereJsonTree": { - "children1": [ - { - "id": "baa888b8-89ab-4cde-b012-31922f8671e9", - "properties": { - "field": "commit_id", - "fieldSrc": "field", - "operator": "equal", - "value": [ - "${commit}" - ], - "valueError": [ - null - ], - "valueSrc": [ - "value" - ], - "valueType": [ - "text" - ] - }, - "type": "rule" - } - ], - "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", - "type": "group" - }, - "whereString": "commit_id = '${commit}'" - }, - "table": "measurements" - } - ], - "title": "Memory usage", - "transparent": true, - "type": "timeseries" - }, - { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": 60000, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "unit": "decmbytes" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, 
+ "x": 12, + "y": 74 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "bdz2yss7sxo1sc" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "decmbytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 74 - }, - "id": 3, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" - }, - "editorMode": "code", - "format": "table", - "rawQuery": true, - "rawSql": "SELECT\n d.gpu_mem_megabytes,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}'", - "refId": "A", - "sql": { - "columns": [ - { - "parameters": [ + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n d.gpu_mem_megabytes,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}';", + "refId": "A", + "sql": { + "columns": [ { - "name": "cpu_util", - "type": "functionParameter" - } - ], - "type": "function" - }, - { - "parameters": [ + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, { - "name": "mem_megabytes", - "type": "functionParameter" - } - ], - "type": "function" - }, - { - "parameters": [ + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, { - "name": "gpu_util", - 
"type": "functionParameter" - } - ], - "type": "function" - }, - { - "parameters": [ + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, { - "name": "gpu_mem_megabytes", - "type": "functionParameter" + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" } ], - "type": "function" - }, - { - "parameters": [ + "groupBy": [ { - "name": "\"time\"", - "type": "functionParameter" + "property": { + "type": "string" + }, + "type": "groupBy" } ], - "type": "function" - } - ], - "groupBy": [ - { - "property": { - "type": "string" + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" }, - "type": "groupBy" - } - ], - "limit": 50, - "whereJsonTree": { - "children1": [ - { - "id": "baa888b8-89ab-4cde-b012-31922f8671e9", - "properties": { - "field": "commit_id", - "fieldSrc": "field", - "operator": "equal", - "value": [ - "${commit}" - ], - "valueError": [ - null - ], - "valueSrc": [ - "value" - ], - "valueType": [ - "text" - ] - }, - "type": "rule" - } - ], - "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", - "type": "group" - }, - "whereString": "commit_id = '${commit}'" - }, - "table": "measurements" + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "GPU memory usage", + "transparent": true, + "type": "timeseries" } ], - "title": "GPU memory usage", - "transparent": true, - "type": "timeseries" + "title": "Usage metrics", + "type": "row" } ], + "refresh": "", 
"schemaVersion": 39, "tags": [], "templating": { @@ -2105,7 +2237,7 @@ }, "datasource": { "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "definition": "SELECT DISTINCT branch FROM benchmarks;", "description": "", @@ -2125,12 +2257,12 @@ { "current": { "selected": false, - "text": "1728662868776", - "value": "1728662868776" + "text": "1729701492845", + "value": "1729701492845" }, "datasource": { "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "definition": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id ASC LIMIT 1;", "description": "", @@ -2149,12 +2281,12 @@ { "current": { "selected": false, - "text": "1728663254125", - "value": "1728663254125" + "text": "1730120430069", + "value": "1730120430069" }, "datasource": { "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "definition": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}' ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", "description": "", @@ -2164,7 +2296,7 @@ "name": "EndTime", "options": [], "query": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}' ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2178,7 +2310,7 @@ }, "datasource": { "type": "grafana-postgresql-datasource", - "uid": "de0dbhs18ho1sc" + "uid": "be28nkzirtb0gd" }, "definition": "SELECT DISTINCT gpu_name FROM benchmarks;", "hide": 0, @@ -2188,11 +2320,32 @@ "name": "gpu_name", "options": [], "query": "SELECT DISTINCT gpu_name FROM benchmarks;", - "refresh": 1, + "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 0, "type": "query" + }, + { + 
"current": { + "selected": false, + "text": "10", + "value": "10" + }, + "description": "The number of commits to display, going from most recent to the nth commit.", + "hide": 0, + "label": "Last # of commits", + "name": "last_n_commits", + "options": [ + { + "selected": true, + "text": "10", + "value": "10" + } + ], + "query": "10", + "skipUrlSync": false, + "type": "textbox" } ] }, @@ -2206,6 +2359,6 @@ "timezone": "browser", "title": "Transformers benchmarks", "uid": "fdz33iyzln9c0a", - "version": 11, + "version": 4, "weekStart": "" } diff --git a/benchmark/llama.py b/benchmark/llama.py index a926f903486..4a2c57422e6 100644 --- a/benchmark/llama.py +++ b/benchmark/llama.py @@ -96,17 +96,21 @@ def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_ge ) conn.commit() benchmark_id = cur.fetchone()[0] + logger.info(f"running benchmark #{benchmark_id} on {gpu_name}") metrics_thread = Thread(target=collect_metrics, args=[benchmark_id, continue_metric_collection]) metrics_thread.start() + logger.info("started background thread to fetch device metrics") os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling device = "cuda" ckpt = "meta-llama/Llama-2-7b-hf" + logger.info("downloading weights") # This is to avoid counting download in model load time measurement model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16) gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1) + logger.info("loading model") start = perf_counter() model = AutoModelForCausalLM.from_pretrained( ckpt, torch_dtype=torch.float16, generation_config=gen_config From a769ed45e17c44fd17b85c025863c4e4f2f73634 Mon Sep 17 00:00:00 2001 From: Alexandros Benetatos <34627055+alex-bene@users.noreply.github.com> Date: Mon, 28 Oct 2024 20:44:20 +0200 Subject: [PATCH 46/99] Add `post_process_depth_estimation` for GLPN (#34413) * add depth postprocessing for GLPN * remove previous temp fix for glpn tests * Style changes for GLPN's 
`post_process_depth_estimation` Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * additional style fix --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- .../models/glpn/image_processing_glpn.py | 54 ++++++++++++++++++- src/transformers/models/glpn/modeling_glpn.py | 16 +++--- tests/models/glpn/test_modeling_glpn.py | 8 --- 3 files changed, 59 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py index 9e69c8ae8a6..115cefc86be 100644 --- a/src/transformers/models/glpn/image_processing_glpn.py +++ b/src/transformers/models/glpn/image_processing_glpn.py @@ -14,7 +14,11 @@ # limitations under the License. """Image processor class for GLPN.""" -from typing import List, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union + + +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput import numpy as np import PIL.Image @@ -27,12 +31,17 @@ get_image_size, infer_channel_dimension_format, is_scaled_image, + is_torch_available, make_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, ) -from ...utils import TensorType, filter_out_non_signature_kwargs, logging +from ...utils import TensorType, filter_out_non_signature_kwargs, logging, requires_backends + + +if is_torch_available(): + import torch logger = logging.get_logger(__name__) @@ -218,3 +227,44 @@ def preprocess( data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + outputs: "DepthEstimatorOutput", + target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. 
+ + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. + """ + requires_backends(self, "torch") + + predicted_depth = outputs.predicted_depth + + if (target_sizes is not None) and (len(predicted_depth) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the predicted depth" + ) + + results = [] + target_sizes = [None] * len(predicted_depth) if target_sizes is None else target_sizes + for depth, target_size in zip(predicted_depth, target_sizes): + if target_size is not None: + depth = depth[None, None, ...] + depth = torch.nn.functional.interpolate(depth, size=target_size, mode="bicubic", align_corners=False) + depth = depth.squeeze() + + results.append({"predicted_depth": depth}) + + return results diff --git a/src/transformers/models/glpn/modeling_glpn.py b/src/transformers/models/glpn/modeling_glpn.py index 9fd22ca0f7b..70f175df8c9 100755 --- a/src/transformers/models/glpn/modeling_glpn.py +++ b/src/transformers/models/glpn/modeling_glpn.py @@ -723,20 +723,18 @@ def forward( >>> with torch.no_grad(): ... outputs = model(**inputs) - ... predicted_depth = outputs.predicted_depth >>> # interpolate to original size - >>> prediction = torch.nn.functional.interpolate( - ... predicted_depth.unsqueeze(1), - ... size=image.size[::-1], - ... mode="bicubic", - ... align_corners=False, + >>> post_processed_output = image_processor.post_process_depth_estimation( + ... outputs, + ... target_sizes=[(image.height, image.width)], ... 
) >>> # visualize the prediction - >>> output = prediction.squeeze().cpu().numpy() - >>> formatted = (output * 255 / np.max(output)).astype("uint8") - >>> depth = Image.fromarray(formatted) + >>> predicted_depth = post_processed_output[0]["predicted_depth"] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = ( diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py index 254c1135357..81e95ab244f 100644 --- a/tests/models/glpn/test_modeling_glpn.py +++ b/tests/models/glpn/test_modeling_glpn.py @@ -157,14 +157,6 @@ def setUp(self): self.model_tester = GLPNModelTester(self) self.config_tester = GLPNConfigTester(self, config_class=GLPNConfig) - @unittest.skip(reason="Failing after #32550") - def test_pipeline_depth_estimation(self): - pass - - @unittest.skip(reason="Failing after #32550") - def test_pipeline_depth_estimation_fp16(self): - pass - def test_config(self): self.config_tester.run_common_tests() From fe76b603702c7ae7ee4acafd1bc8a7ed80d61950 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 29 Oct 2024 07:54:51 +0100 Subject: [PATCH 47/99] LLaVA: latency issues (#34460) * fix llavas * code style * green ci --- .../models/llava/modeling_llava.py | 127 ++++++++-------- .../models/llava_next/modeling_llava_next.py | 135 +++++++++--------- .../modeling_llava_next_video.py | 13 +- .../modular_llava_next_video.py | 13 +- .../video_llava/modeling_video_llava.py | 13 +- .../models/vipllava/modeling_vipllava.py | 123 ++++++++-------- 6 files changed, 186 insertions(+), 238 deletions(-) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 0b2492fc711..a0079f1787a 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ 
b/src/transformers/models/llava/modeling_llava.py @@ -472,6 +472,7 @@ def forward( (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length ) or (input_ids.shape[-1] == 1 and pixel_values is not None) + image_features = None if pixel_values is not None: image_features = self.get_image_features( pixel_values=pixel_values, @@ -479,69 +480,67 @@ def forward( vision_feature_select_strategy=vision_feature_select_strategy, ) - if legacy_processing: - logger.warning_once( - "Expanding inputs for image tokens in LLaVa should be done in processing. " - "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " - "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + if legacy_processing: + logger.warning_once( + "Expanding inputs for image tokens in LLaVa should be done in processing. " + "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " + "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " + "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." 
+ ) + # prefill stage vs decoding stage (legacy behavior copied) + if input_ids.shape[1] != 1: + inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( + image_features, inputs_embeds, input_ids, attention_mask, labels ) - # prefill stage vs decoding stage (legacy behavior copied) - if input_ids.shape[1] != 1: - inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( - image_features, inputs_embeds, input_ids, attention_mask, labels - ) - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) - else: - # Retrieve the first layer to inspect the logits and mask out the hidden states - # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - - # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - - # Get the target length - target_length = input_ids.shape[1] - past_length = first_layer_past_key_value.shape[-1] - - extended_attention_mask = torch.ones( - (attention_mask.shape[0], past_length), - dtype=attention_mask.dtype, - device=attention_mask.device, - ) - - # Filter out only the tokens that can be un-attended, this can happen - # if one uses Llava + Fused modules where the cache on the - # first iteration is already big enough, or if one passes custom cache - valid_indices = non_attended_tokens < extended_attention_mask.size(-1) - new_batch_index = batch_index[valid_indices] - new_non_attended_tokens = non_attended_tokens[valid_indices] - - # Zero-out the places where we don't need to attend - extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 - - attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 - 
cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[ - -target_length: - ] - - # TODO: @raushan retain only the new behavior after v4.47 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) else: - n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() - n_image_features = image_features.shape[1] - if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - special_image_mask = ( - (input_ids == self.config.image_token_index) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] + + # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) + + # Get the target length + target_length = input_ids.shape[1] + past_length = first_layer_past_key_value.shape[-1] + + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, ) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + # Filter out only the tokens that can be un-attended, this can happen + # if one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] + + # Zero-out the places where we don't need to 
attend + extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + + attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:] + + # TODO: @raushan retain only the new behavior after v4.47 + elif image_features is not None: + n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() + n_image_features = image_features.shape[1] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) outputs = self.language_model( attention_mask=attention_mask, @@ -602,12 +601,6 @@ def prepare_inputs_for_generation( ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - # Trigger the new behavior if we have more than image embeddings seq length tokens for images - legacy_processing = ( - input_ids is not None - and (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length - ) - model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -618,7 +611,7 @@ def prepare_inputs_for_generation( **kwargs, ) - if legacy_processing or cache_position[0] == 0: + if cache_position[0] == 0: # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model model_inputs["pixel_values"] = 
pixel_values diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 0cbda9cfd64..5a49337b2b5 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -846,6 +846,7 @@ def forward( (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length ) or (input_ids.shape[-1] == 1 and pixel_values is not None) + image_features = None if pixel_values is not None and pixel_values.size(0) > 0: image_features = self.get_image_features( pixel_values, @@ -861,74 +862,73 @@ def forward( vision_feature_select_strategy=vision_feature_select_strategy, image_newline=self.image_newline, ) - if legacy_processing: - logger.warning_once( - "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. " - "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " - "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + + if legacy_processing: + logger.warning_once( + "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. " + "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " + "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " + "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." 
+ ) + if input_ids.shape[1] != 1: + inputs_embeds = inputs_embeds.to(image_features.dtype) + inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features( + image_features, + feature_lens, + inputs_embeds, + input_ids, + attention_mask, + position_ids, + labels=labels, ) - if input_ids.shape[1] != 1: - inputs_embeds = inputs_embeds.to(image_features.dtype) - inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features( - image_features, - feature_lens, - inputs_embeds, - input_ids, - attention_mask, - position_ids, - labels=labels, - ) - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) - else: - # Retrieve the first layer to inspect the logits and mask out the hidden states - # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - - # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - - # Get the target length - target_length = input_ids.shape[1] - past_length = first_layer_past_key_value.shape[-1] - - extended_attention_mask = torch.ones( - (attention_mask.shape[0], past_length), - dtype=attention_mask.dtype, - device=attention_mask.device, - ) + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) + else: + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - # Filter out only the tokens that can be un-attended, this can happen - # if one uses Llava + Fused modules where the cache on the - # first iteration is already big enough, or if one passes custom cache - valid_indices = non_attended_tokens < extended_attention_mask.size(-1) - new_batch_index = batch_index[valid_indices] - 
new_non_attended_tokens = non_attended_tokens[valid_indices] - - # Zero-out the places where we don't need to attend - extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 - attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[ - -target_length: - ] + # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - # TODO: @raushan retain only the new behavior after v4.47 - else: - n_image_tokens = (input_ids == self.config.image_token_index).sum().item() - n_image_features = image_features.shape[0] - if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - special_image_mask = ( - (input_ids == self.config.image_token_index) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) + # Get the target length + target_length = input_ids.shape[1] + past_length = first_layer_past_key_value.shape[-1] + + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, + ) + + # Filter out only the tokens that can be un-attended, this can happen + # if one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] + + # Zero-out the places where we don't need to attend + extended_attention_mask[new_batch_index, 
new_non_attended_tokens] = 0 + attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:] + + # TODO: @raushan retain only the new behavior after v4.47 + elif image_features is not None: + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" ) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) outputs = self.language_model( attention_mask=attention_mask, @@ -990,11 +990,6 @@ def prepare_inputs_for_generation( ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - legacy_processing = ( - input_ids is not None - and (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length - ) - model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -1007,7 +1002,7 @@ def prepare_inputs_for_generation( # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model - if legacy_processing or cache_position[0] == 0: + if cache_position[0] == 0: model_inputs["pixel_values"] = pixel_values 
model_inputs["image_sizes"] = image_sizes diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 96f4373afd9..44b372535d7 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -1110,17 +1110,6 @@ def prepare_inputs_for_generation( ): # Overwritten -- extra custom processing - if input_ids is not None: - img_token_not_enough = (input_ids == self.config.image_token_index).sum( - 1 - ).max() < self.config.image_seq_length - video_token_not_enough = (input_ids == self.config.video_token_index).sum( - 1 - ).max() < self.config.video_seq_length - legacy_processing = (img_token_not_enough and pixel_values is not None) or ( - video_token_not_enough and pixel_values_videos is not None - ) - model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -1133,7 +1122,7 @@ def prepare_inputs_for_generation( # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model - if legacy_processing or cache_position[0] == 0: + if cache_position[0] == 0: model_inputs["pixel_values"] = pixel_values model_inputs["pixel_values_videos"] = pixel_values_videos model_inputs["image_sizes"] = image_sizes diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index c1ed7571941..e9974e92049 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -623,17 +623,6 @@ def prepare_inputs_for_generation( ): # Overwritten -- extra custom processing - if input_ids is not None: - img_token_not_enough = (input_ids == 
self.config.image_token_index).sum( - 1 - ).max() < self.config.image_seq_length - video_token_not_enough = (input_ids == self.config.video_token_index).sum( - 1 - ).max() < self.config.video_seq_length - legacy_processing = (img_token_not_enough and pixel_values is not None) or ( - video_token_not_enough and pixel_values_videos is not None - ) - model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -646,7 +635,7 @@ def prepare_inputs_for_generation( # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model - if legacy_processing or cache_position[0] == 0: + if cache_position[0] == 0: model_inputs["pixel_values"] = pixel_values model_inputs["pixel_values_videos"] = pixel_values_videos model_inputs["image_sizes"] = image_sizes diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index a9bd8b745a6..30f82e45056 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -720,17 +720,6 @@ def prepare_inputs_for_generation( ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - if input_ids is not None: - img_token_not_enough = (input_ids == self.config.image_token_index).sum( - 1 - ).max() < self.config.image_seq_length - video_token_not_enough = (input_ids == self.config.video_token_index).sum( - 1 - ).max() < self.config.video_seq_length - legacy_processing = (img_token_not_enough and pixel_values_images is not None) or ( - video_token_not_enough and pixel_values_videos is not None - ) - model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -741,7 +730,7 @@ def prepare_inputs_for_generation( **kwargs, ) - if legacy_processing 
or cache_position[0] == 0: + if cache_position[0] == 0: # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model model_inputs["pixel_values_images"] = pixel_values_images diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 987ae0ad0c6..c9db6e261c6 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -466,72 +466,71 @@ def forward( (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length ) or (input_ids.shape[-1] == 1 and pixel_values is not None) + image_features = None if pixel_values is not None: image_features = self.get_image_features( pixel_values=pixel_values, vision_feature_layers=vision_feature_layers ) - if legacy_processing: - logger.warning_once( - "Expanding inputs for image tokens in VipLLaVa should be done in processing. " - "Please add `patch_size` and `vision_feature_select_strategy` to the model's image processing config. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + if legacy_processing: + logger.warning_once( + "Expanding inputs for image tokens in VipLLaVa should be done in processing. " + "Please add `patch_size` and `vision_feature_select_strategy` to the model's image processing config. " + "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." 
+ ) + # prefill stage vs decoding stage (legacy behavior copied) + if input_ids.shape[1] != 1: + inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( + image_features, inputs_embeds, input_ids, attention_mask, labels ) - # prefill stage vs decoding stage (legacy behavior copied) - if input_ids.shape[1] != 1: - inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( - image_features, inputs_embeds, input_ids, attention_mask, labels - ) - cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) - else: - # Retrieve the first layer to inspect the logits and mask out the hidden states - # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - - # Sum all dimensions of head_dim (-1) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - - target_length = input_ids.shape[1] - past_length = first_layer_past_key_value.shape[-1] - - extended_attention_mask = torch.ones( - (attention_mask.shape[0], past_length), - dtype=attention_mask.dtype, - device=attention_mask.device, - ) - - # Filter out only the tokens that can be un-attended, this can happen - # in the case one uses Llava + Fused modules where the cache on the - # first iteration is already big enough, or if one passes custom cache - valid_indices = non_attended_tokens < extended_attention_mask.size(-1) - new_batch_index = batch_index[valid_indices] - new_non_attended_tokens = non_attended_tokens[valid_indices] - - # Zero-out the places where we don't need to attend - extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 - - attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 - cache_position = 
torch.arange(attention_mask.shape[1], device=attention_mask.device)[ - -target_length: - ] - - # TODO: @raushan retain only the new behavior after v4.47 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device) else: - n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() - n_image_features = image_features.shape[1] - if n_image_tokens != n_image_features: - raise ValueError( - f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" - ) - special_image_mask = ( - (input_ids == self.config.image_token_index) - .unsqueeze(-1) - .expand_as(inputs_embeds) - .to(inputs_embeds.device) + # Retrieve the first layer to inspect the logits and mask out the hidden states + # that are set to 0 + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] + + # Sum all dimensions of head_dim (-1) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) + + target_length = input_ids.shape[1] + past_length = first_layer_past_key_value.shape[-1] + + extended_attention_mask = torch.ones( + (attention_mask.shape[0], past_length), + dtype=attention_mask.dtype, + device=attention_mask.device, ) - image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) - inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + # Filter out only the tokens that can be un-attended, this can happen + # in the case one uses Llava + Fused modules where the cache on the + # first iteration is already big enough, or if one passes custom cache + valid_indices = non_attended_tokens < extended_attention_mask.size(-1) + new_batch_index = batch_index[valid_indices] + new_non_attended_tokens = non_attended_tokens[valid_indices] + + # Zero-out the places where we don't need to attend + 
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 + + attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) + position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:] + + # TODO: @raushan retain only the new behavior after v4.47 + elif image_features is not None: + n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() + n_image_features = image_features.shape[1] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + special_image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) outputs = self.language_model( attention_mask=attention_mask, @@ -590,12 +589,6 @@ def prepare_inputs_for_generation( ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - # Trigger the new behavior if we have more than image embeddings seq length tokens for images - legacy_processing = ( - input_ids is not None - and (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length - ) - model_inputs = self.language_model.prepare_inputs_for_generation( input_ids, past_key_values=past_key_values, @@ -606,7 +599,7 @@ def prepare_inputs_for_generation( **kwargs, ) - if legacy_processing or cache_position[0] == 0: + if cache_position[0] == 0: # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore # Otherwise we need pixel values to be passed to model model_inputs["pixel_values"] = pixel_values From 
808d6c50f8c6911d972f27bb5155c04e513c99ee Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 29 Oct 2024 07:57:10 +0100 Subject: [PATCH 48/99] Generation: fix test (#34369) * fix test * fix copies --- tests/generation/test_utils.py | 44 +++++-------------- tests/models/idefics/test_modeling_idefics.py | 3 +- tests/models/mamba2/test_modeling_mamba2.py | 3 +- tests/models/moshi/test_modeling_moshi.py | 22 +++++----- 4 files changed, 28 insertions(+), 44 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 6f2eaf734df..d552bf73442 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -671,29 +671,6 @@ def test_beam_sample_generate(self): else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1]) - # for VLMs inputs embeds won't match input ids unless images are encoded and merged with ids properly - # no quick fix available, since obtaining image embeddings step is very model-specific - if any(name in model.__class__.__name__.lower() for name in ("blip", "llava", "paligemma")): - prepare_inputs_for_generation_args = set( - inspect.signature(model.prepare_inputs_for_generation).parameters - ) - # `inputs_embeds` input is well supported when `cache_positions` is used, because it means the modeling - # code is up to date with our most recent standards - if ( - "inputs_embeds" in prepare_inputs_for_generation_args - and "cache_positions" in prepare_inputs_for_generation_args - ): - input_embeds = model.get_input_embeddings()(inputs_dict["input_ids"]) - beam_kwargs.update({"inputs_embeds": input_embeds}) - output_generate2 = self._beam_sample_generate( - model=model, - input_ids=None, - inputs_dict={}, - beam_kwargs=beam_kwargs, - ) - - torch.testing.assert_close(output_generate[:, input_embeds.shape[1] :], output_generate2) - @pytest.mark.generate def test_beam_sample_generate_dict_output(self): for model_class in 
self.all_generative_model_classes: @@ -1570,7 +1547,8 @@ def test_past_key_values_format(self): ) @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([(1,), (2,)]) + def test_generate_from_inputs_embeds_decoder_only(self, num_beams): # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` # if fails, you should probably update the `prepare_inputs_for_generation` function for model_class in self.all_generative_model_classes: @@ -1597,11 +1575,15 @@ def test_generate_from_inputs_embeds_decoder_only(self): continue input_ids = inputs_dict.pop("input_ids") + generation_kwargs = { + "return_dict_in_generate": True, + "output_scores": True, + "num_beams": num_beams, + "do_sample": False, + } # Traditional way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True - ) + outputs_from_ids = model.generate(input_ids, max_new_tokens=5, **generation_kwargs) self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) @@ -1610,8 +1592,7 @@ def test_generate_from_inputs_embeds_decoder_only(self): input_ids, inputs_embeds=inputs_embeds, max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, + **generation_kwargs, ) self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) @@ -1622,15 +1603,14 @@ def test_generate_from_inputs_embeds_decoder_only(self): input_ids, inputs_embeds=random_embeds, max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, + **generation_kwargs, ) for i in range(len(outputs_from_rand_embeds.scores)): self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) # input_ids is not a required input -- if we don't pass it, the newly 
generated tokens will be the same outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, max_new_tokens=5, return_dict_in_generate=True, output_scores=True + inputs_embeds=inputs_embeds, max_new_tokens=5, **generation_kwargs ) self.assertListEqual( outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index bbade169550..c2f0ef8ccd0 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -773,7 +773,8 @@ def test_custom_4d_attention_mask(self): @unittest.skip( reason="IDEFICS has specific requirements for working with inputs embeds like passing also the ids and pixels" ) - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([(1,), (2,)]) + def test_generate_from_inputs_embeds_decoder_only(self, num_beams): pass @unittest.skip(reason="IDEFICS cannot compile due to dynamic control flow when checking inputs") diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index f19358a22f4..1a8cf047745 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -204,7 +204,8 @@ def test_generate_without_input_ids(self): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([(1,), (2,)]) + def test_generate_from_inputs_embeds_decoder_only(self, num_beams): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index dd9302ee2c5..b77a6ff1036 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -656,16 +656,21 @@ def test_initialization(self): ) @pytest.mark.generate 
- def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([(1,), (2,)]) + def test_generate_from_inputs_embeds_decoder_only(self, num_beams): for model_class in self.all_generative_model_classes: config, input_ids, _, inputs_dict = self._get_input_ids_and_config() model = model_class(config).to(torch_device).eval() + generation_kwargs = { + "return_dict_in_generate": True, + "output_scores": True, + "num_beams": num_beams, + "do_sample": False, + } # Traditional way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True, **inputs_dict - ) + outputs_from_ids = model.generate(input_ids, max_new_tokens=5, **generation_kwargs, **inputs_dict) self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) @@ -674,8 +679,7 @@ def test_generate_from_inputs_embeds_decoder_only(self): input_ids, inputs_embeds=inputs_embeds, max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, + **generation_kwargs, **inputs_dict, ) @@ -686,8 +690,7 @@ def test_generate_from_inputs_embeds_decoder_only(self): input_ids, inputs_embeds=random_embeds, max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, + **generation_kwargs, **inputs_dict, ) for i in range(len(outputs_from_rand_embeds.scores)): @@ -697,8 +700,7 @@ def test_generate_from_inputs_embeds_decoder_only(self): outputs_from_embeds_wo_ids = model.generate( inputs_embeds=inputs_embeds, max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, + **generation_kwargs, **inputs_dict, ) self.assertListEqual( From 63ca6d9771b13b603deb228420623681188a4dc2 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 29 Oct 2024 08:26:04 +0100 Subject: [PATCH 49/99] Fix CI (#34458) * fix * fix mistral --- src/transformers/generation/flax_utils.py | 2 ++ 
tests/generation/test_flax_utils.py | 4 ++++ tests/test_modeling_common.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py index 08480ac983e..88535b44e9c 100644 --- a/src/transformers/generation/flax_utils.py +++ b/src/transformers/generation/flax_utils.py @@ -397,6 +397,8 @@ def generate( "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" ) generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length + else: # by default let's always generate 10 new tokens + generation_config.max_length = generation_config.max_length + input_ids_seq_length if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length: raise ValueError( diff --git a/tests/generation/test_flax_utils.py b/tests/generation/test_flax_utils.py index 647482b88cd..bb0c1828763 100644 --- a/tests/generation/test_flax_utils.py +++ b/tests/generation/test_flax_utils.py @@ -101,6 +101,10 @@ def test_greedy_generate_pt_fx(self): pt_model = pt_model_class(config).eval() pt_model = load_flax_weights_in_pytorch_model(pt_model, flax_model.params) + # Generate max 5 tokens only otherwise seems to be numerical error accumulation + pt_model.generation_config.max_length = 5 + flax_model.generation_config.max_length = 5 + flax_generation_outputs = flax_model.generate(input_ids).sequences pt_generation_outputs = pt_model.generate(torch.tensor(input_ids, dtype=torch.long)) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 51d51dfcc28..d88b0dc5f02 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3002,7 +3002,7 @@ def test_inputs_embeds_matches_input_ids(self): def test_inputs_embeds_matches_input_ids_with_generate(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: + for 
model_class in self.all_generative_model_classes: if model_class.__name__ not in [ *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), From 655bec2da7120a8681acc2ce951f8d58c6f0e6ef Mon Sep 17 00:00:00 2001 From: kang sheng Date: Tue, 29 Oct 2024 16:39:06 +0800 Subject: [PATCH 50/99] use a tinymodel to test generation config which aviod timeout (#34482) * use a tinymodel to test generation config which aviod timeout * remove tailing whitespace --- tests/utils/test_modeling_utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 8af47cde8e5..0452a10d5d5 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -1544,15 +1544,16 @@ def test_pretrained_low_mem_new_config(self): self.assertEqual(model.__class__.__name__, model_ref.__class__.__name__) def test_generation_config_is_loaded_with_model(self): - # Note: `TinyLlama/TinyLlama-1.1B-Chat-v1.0` has a `generation_config.json` containing `max_length: 2048` + # Note: `hf-internal-testing/tiny-random-MistralForCausalLM` has a `generation_config.json` + # containing `bos_token_id: 1` # 1. Load without further parameters - model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") - self.assertEqual(model.generation_config.max_length, 2048) + model = AutoModelForCausalLM.from_pretrained(TINY_MISTRAL) + self.assertEqual(model.generation_config.bos_token_id, 1) # 2. 
Load with `device_map` - model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="auto") - self.assertEqual(model.generation_config.max_length, 2048) + model = AutoModelForCausalLM.from_pretrained(TINY_MISTRAL, device_map="auto") + self.assertEqual(model.generation_config.bos_token_id, 1) @require_safetensors def test_safetensors_torch_from_torch(self): From a1835195d134f5a244aed1212342be94fa27b40c Mon Sep 17 00:00:00 2001 From: StevenBucaille Date: Tue, 29 Oct 2024 10:36:03 +0100 Subject: [PATCH 51/99] =?UTF-8?q?=F0=9F=9A=A8=F0=9F=9A=A8=F0=9F=9A=A8=20[S?= =?UTF-8?q?uperPoint]=20Fix=20keypoint=20coordinate=20output=20and=20add?= =?UTF-8?q?=20post=20processing=20(#33200)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Added int conversion and unwrapping * test: added tests for post_process_keypoint_detection of SuperPointImageProcessor * docs: changed docs to include post_process_keypoint_detection method and switched from opencv to matplotlib * test: changed test to not depend on SuperPointModel forward * test: added missing require_torch decorator * docs: changed pyplot parameters for the keypoints to be more visible in the example * tests: changed import torch location to make test_flax and test_tf * Revert "tests: changed import torch location to make test_flax and test_tf" This reverts commit 39b32a2f69500bc7af01715fc7beae2260549afe. * tests: fixed import * chore: applied suggestions from code review Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * tests: fixed import * tests: fixed import (bis) * tests: fixed import (ter) * feat: added choice of type for target_size and changed tests accordingly * docs: updated code snippet to reflect the addition of target size type choice in post process method * tests: fixed imports (...) * tests: fixed imports (...) 
* style: formatting file * docs: fixed typo from image[0] to image.size[0] * docs: added output image and fixed some tests * Update docs/source/en/model_doc/superpoint.md Co-authored-by: Pavel Iakubovskii * fix: included SuperPointKeypointDescriptionOutput in TYPE_CHECKING if statement and changed tests results to reflect changes to SuperPoint from absolute keypoints coordinates to relative * docs: changed SuperPoint's docs to print output instead of just accessing * style: applied make style * docs: added missing output type and precision in docstring of post_process_keypoint_detection * perf: deleted loop to perform keypoint conversion in one statement * fix: moved keypoint conversion at the end of model forward * docs: changed SuperPointInterestPointDecoder to SuperPointKeypointDecoder class name and added relative (x, y) coordinates information to its method * fix: changed type hint * refactor: removed unnecessary brackets * revert: SuperPointKeypointDecoder to SuperPointInterestPointDecoder * Update docs/source/en/model_doc/superpoint.md Co-authored-by: Pavel Iakubovskii --------- Co-authored-by: Steven Bucaille Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Pavel Iakubovskii --- docs/source/en/model_doc/superpoint.md | 37 +++++++----- .../superpoint/image_processing_superpoint.py | 59 ++++++++++++++++++- .../models/superpoint/modeling_superpoint.py | 10 +++- .../test_image_processing_superpoint.py | 54 ++++++++++++++++- .../superpoint/test_modeling_superpoint.py | 10 ++-- 5 files changed, 147 insertions(+), 23 deletions(-) diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md index b9aab2f1b92..59e451adceb 100644 --- a/docs/source/en/model_doc/superpoint.md +++ b/docs/source/en/model_doc/superpoint.md @@ -86,24 +86,32 @@ model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/sup inputs = processor(images, return_tensors="pt") outputs = model(**inputs) - 
-for i in range(len(images)): - image_mask = outputs.mask[i] - image_indices = torch.nonzero(image_mask).squeeze() - image_keypoints = outputs.keypoints[i][image_indices] - image_scores = outputs.scores[i][image_indices] - image_descriptors = outputs.descriptors[i][image_indices] +image_sizes = [(image.height, image.width) for image in images] +outputs = processor.post_process_keypoint_detection(outputs, image_sizes) + +for output in outputs: + for keypoints, scores, descriptors in zip(output["keypoints"], output["scores"], output["descriptors"]): + print(f"Keypoints: {keypoints}") + print(f"Scores: {scores}") + print(f"Descriptors: {descriptors}") ``` -You can then print the keypoints on the image to visualize the result : +You can then print the keypoints on the image of your choice to visualize the result: ```python -import cv2 -for keypoint, score in zip(image_keypoints, image_scores): - keypoint_x, keypoint_y = int(keypoint[0].item()), int(keypoint[1].item()) - color = tuple([score.item() * 255] * 3) - image = cv2.circle(image, (keypoint_x, keypoint_y), 2, color) -cv2.imwrite("output_image.png", image) +import matplotlib.pyplot as plt + +plt.axis("off") +plt.imshow(image_1) +plt.scatter( + outputs[0]["keypoints"][:, 0], + outputs[0]["keypoints"][:, 1], + c=outputs[0]["scores"] * 100, + s=outputs[0]["scores"] * 50, + alpha=0.8 +) +plt.savefig(f"output_image.png") ``` +![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/ZtFmphEhx8tcbEQqOolyE.png) This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille). The original code can be found [here](https://github.com/magicleap/SuperPointPretrainedNetwork). 
@@ -123,6 +131,7 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] SuperPointImageProcessor - preprocess +- post_process_keypoint_detection ## SuperPointForKeypointDetection diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py index fbbb717570c..65309b1c182 100644 --- a/src/transformers/models/superpoint/image_processing_superpoint.py +++ b/src/transformers/models/superpoint/image_processing_superpoint.py @@ -13,11 +13,11 @@ # limitations under the License. """Image processor class for SuperPoint.""" -from typing import Dict, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import numpy as np -from ... import is_vision_available +from ... import is_torch_available, is_vision_available from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import resize, to_channel_dimension_format from ...image_utils import ( @@ -32,6 +32,12 @@ from ...utils import TensorType, logging, requires_backends +if is_torch_available(): + import torch + +if TYPE_CHECKING: + from .modeling_superpoint import SuperPointKeypointDescriptionOutput + if is_vision_available(): import PIL @@ -270,3 +276,52 @@ def preprocess( data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_keypoint_detection( + self, outputs: "SuperPointKeypointDescriptionOutput", target_sizes: Union[TensorType, List[Tuple]] + ) -> List[Dict[str, "torch.Tensor"]]: + """ + Converts the raw output of [`SuperPointForKeypointDetection`] into lists of keypoints, scores and descriptors + with coordinates absolute to the original image sizes. + + Args: + outputs ([`SuperPointKeypointDescriptionOutput`]): + Raw outputs of the model containing keypoints in a relative (x, y) format, with scores and descriptors. 
+ target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. This must be the original + image size (before any processing). + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the keypoints in absolute format according + to target_sizes, scores and descriptors for an image in the batch as predicted by the model. + """ + if len(outputs.mask) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the mask") + + if isinstance(target_sizes, List): + image_sizes = torch.tensor(target_sizes) + else: + if target_sizes.shape[1] != 2: + raise ValueError( + "Each element of target_sizes must contain the size (h, w) of each image of the batch" + ) + image_sizes = target_sizes + + # Flip the image sizes to (width, height) and convert keypoints to absolute coordinates + image_sizes = torch.flip(image_sizes, [1]) + masked_keypoints = outputs.keypoints * image_sizes[:, None] + + # Convert masked_keypoints to int + masked_keypoints = masked_keypoints.to(torch.int32) + + results = [] + for image_mask, keypoints, scores, descriptors in zip( + outputs.mask, masked_keypoints, outputs.scores, outputs.descriptors + ): + indices = torch.nonzero(image_mask).squeeze(1) + keypoints = keypoints[indices] + scores = scores[indices] + descriptors = descriptors[indices] + results.append({"keypoints": keypoints, "scores": scores, "descriptors": descriptors}) + + return results diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py index cfd3dfd86e8..1075de299a9 100644 --- a/src/transformers/models/superpoint/modeling_superpoint.py +++ b/src/transformers/models/superpoint/modeling_superpoint.py @@ -239,7 +239,10 @@ def _get_pixel_scores(self, encoded: torch.Tensor) -> torch.Tensor: return 
scores def _extract_keypoints(self, scores: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Based on their scores, extract the pixels that represent the keypoints that will be used for descriptors computation""" + """ + Based on their scores, extract the pixels that represent the keypoints that will be used for descriptors computation. + The keypoints are in the form of relative (x, y) coordinates. + """ _, height, width = scores.shape # Threshold keypoints by score value @@ -447,7 +450,7 @@ def forward( pixel_values = self.extract_one_channel_pixel_values(pixel_values) - batch_size = pixel_values.shape[0] + batch_size, _, height, width = pixel_values.shape encoder_outputs = self.encoder( pixel_values, @@ -485,6 +488,9 @@ def forward( descriptors[i, : _descriptors.shape[0]] = _descriptors mask[i, : _scores.shape[0]] = 1 + # Convert to relative coordinates + keypoints = keypoints / torch.tensor([width, height], device=keypoints.device) + hidden_states = encoder_outputs[1] if output_hidden_states else None if not return_dict: return tuple(v for v in [loss, keypoints, scores, descriptors, mask, hidden_states] if v is not None) diff --git a/tests/models/superpoint/test_image_processing_superpoint.py b/tests/models/superpoint/test_image_processing_superpoint.py index 90bbf82d1ed..c2eae872004 100644 --- a/tests/models/superpoint/test_image_processing_superpoint.py +++ b/tests/models/superpoint/test_image_processing_superpoint.py @@ -16,7 +16,7 @@ import numpy as np from transformers.testing_utils import require_torch, require_vision -from transformers.utils import is_vision_available +from transformers.utils import is_torch_available, is_vision_available from ...test_image_processing_common import ( ImageProcessingTestMixin, @@ -24,6 +24,11 @@ ) +if is_torch_available(): + import torch + + from transformers.models.superpoint.modeling_superpoint import SuperPointKeypointDescriptionOutput + if is_vision_available(): from transformers import 
SuperPointImageProcessor @@ -70,6 +75,23 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F torchify=torchify, ) + def prepare_keypoint_detection_output(self, pixel_values): + max_number_keypoints = 50 + batch_size = len(pixel_values) + mask = torch.zeros((batch_size, max_number_keypoints)) + keypoints = torch.zeros((batch_size, max_number_keypoints, 2)) + scores = torch.zeros((batch_size, max_number_keypoints)) + descriptors = torch.zeros((batch_size, max_number_keypoints, 16)) + for i in range(batch_size): + random_number_keypoints = np.random.randint(0, max_number_keypoints) + mask[i, :random_number_keypoints] = 1 + keypoints[i, :random_number_keypoints] = torch.rand((random_number_keypoints, 2)) + scores[i, :random_number_keypoints] = torch.rand((random_number_keypoints,)) + descriptors[i, :random_number_keypoints] = torch.rand((random_number_keypoints, 16)) + return SuperPointKeypointDescriptionOutput( + loss=None, keypoints=keypoints, scores=scores, descriptors=descriptors, mask=mask, hidden_states=None + ) + @require_torch @require_vision @@ -110,3 +132,33 @@ def test_input_image_properly_converted_to_grayscale(self): pre_processed_images = image_processor.preprocess(image_inputs) for image in pre_processed_images["pixel_values"]: self.assertTrue(np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] 
== image[2, ...])) + + @require_torch + def test_post_processing_keypoint_detection(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + image_inputs = self.image_processor_tester.prepare_image_inputs() + pre_processed_images = image_processor.preprocess(image_inputs, return_tensors="pt") + outputs = self.image_processor_tester.prepare_keypoint_detection_output(**pre_processed_images) + + def check_post_processed_output(post_processed_output, image_size): + for post_processed_output, image_size in zip(post_processed_output, image_size): + self.assertTrue("keypoints" in post_processed_output) + self.assertTrue("descriptors" in post_processed_output) + self.assertTrue("scores" in post_processed_output) + keypoints = post_processed_output["keypoints"] + all_below_image_size = torch.all(keypoints[:, 0] <= image_size[1]) and torch.all( + keypoints[:, 1] <= image_size[0] + ) + all_above_zero = torch.all(keypoints[:, 0] >= 0) and torch.all(keypoints[:, 1] >= 0) + self.assertTrue(all_below_image_size) + self.assertTrue(all_above_zero) + + tuple_image_sizes = [(image.size[0], image.size[1]) for image in image_inputs] + tuple_post_processed_outputs = image_processor.post_process_keypoint_detection(outputs, tuple_image_sizes) + + check_post_processed_output(tuple_post_processed_outputs, tuple_image_sizes) + + tensor_image_sizes = torch.tensor([image.size for image in image_inputs]).flip(1) + tensor_post_processed_outputs = image_processor.post_process_keypoint_detection(outputs, tensor_image_sizes) + + check_post_processed_output(tensor_post_processed_outputs, tensor_image_sizes) diff --git a/tests/models/superpoint/test_modeling_superpoint.py b/tests/models/superpoint/test_modeling_superpoint.py index 25c384a7955..8db435502ca 100644 --- a/tests/models/superpoint/test_modeling_superpoint.py +++ b/tests/models/superpoint/test_modeling_superpoint.py @@ -260,7 +260,7 @@ def test_inference(self): inputs = preprocessor(images=images, 
return_tensors="pt").to(torch_device) with torch.no_grad(): outputs = model(**inputs) - expected_number_keypoints_image0 = 567 + expected_number_keypoints_image0 = 568 expected_number_keypoints_image1 = 830 expected_max_number_keypoints = max(expected_number_keypoints_image0, expected_number_keypoints_image1) expected_keypoints_shape = torch.Size((len(images), expected_max_number_keypoints, 2)) @@ -275,11 +275,13 @@ def test_inference(self): self.assertEqual(outputs.keypoints.shape, expected_keypoints_shape) self.assertEqual(outputs.scores.shape, expected_scores_shape) self.assertEqual(outputs.descriptors.shape, expected_descriptors_shape) - expected_keypoints_image0_values = torch.tensor([[480.0, 9.0], [494.0, 9.0], [489.0, 16.0]]).to(torch_device) + expected_keypoints_image0_values = torch.tensor([[0.75, 0.0188], [0.7719, 0.0188], [0.7641, 0.0333]]).to( + torch_device + ) expected_scores_image0_values = torch.tensor( - [0.0064, 0.0137, 0.0589, 0.0723, 0.5166, 0.0174, 0.1515, 0.2054, 0.0334] + [0.0064, 0.0139, 0.0591, 0.0727, 0.5170, 0.0175, 0.1526, 0.2057, 0.0335] ).to(torch_device) - expected_descriptors_image0_value = torch.tensor(-0.1096).to(torch_device) + expected_descriptors_image0_value = torch.tensor(-0.1095).to(torch_device) predicted_keypoints_image0_values = outputs.keypoints[0, :3] predicted_scores_image0_values = outputs.scores[0, :9] predicted_descriptors_image0_value = outputs.descriptors[0, 0, 0] From 439334c8fb4edf11314dc94c72dda868f87a0808 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:48:57 +0100 Subject: [PATCH 52/99] Simplify running tests in a subprocess (#34213) * check * check * check * check * add docstring --------- Co-authored-by: ydshieh --- src/transformers/testing_utils.py | 40 +++++++++++++++++++ .../models/imagegpt/test_modeling_imagegpt.py | 8 ++-- .../video_llava/test_modeling_video_llava.py | 13 ++++-- 3 files changed, 52 insertions(+), 9 deletions(-) diff --git 
a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 2781e9e102e..0eef286732d 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -2366,6 +2366,46 @@ def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None): test_case.fail(f'{results["error"]}') +def run_test_using_subprocess(func): + """ + To decorate a test to run in a subprocess using the `subprocess` module. This could avoid potential GPU memory + issues (GPU OOM or a test that causes many subsequential failing with `CUDA error: device-side assert triggered`). + """ + import pytest + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if os.getenv("_INSIDE_SUB_PROCESS", None) == "1": + func(*args, **kwargs) + else: + test = " ".join(os.environ.get("PYTEST_CURRENT_TEST").split(" ")[:-1]) + try: + import copy + + env = copy.deepcopy(os.environ) + env["_INSIDE_SUB_PROCESS"] = "1" + + # If not subclass of `unitTest.TestCase` and `pytestconfig` is used: try to grab and use the arguments + if "pytestconfig" in kwargs: + command = list(kwargs["pytestconfig"].invocation_params.args) + for idx, x in enumerate(command): + if x in kwargs["pytestconfig"].args: + test = test.split("::")[1:] + command[idx] = "::".join([f"{func.__globals__['__file__']}"] + test) + command = [f"{sys.executable}", "-m", "pytest"] + command + command = [x for x in command if x not in ["--no-summary"]] + # Otherwise, simply run the test with no option at all + else: + command = [f"{sys.executable}", "-m", "pytest", f"{test}"] + + subprocess.run(command, env=env, check=True, capture_output=True) + except subprocess.CalledProcessError as e: + exception_message = e.stdout.decode() + raise pytest.fail(exception_message, pytrace=False) + + return wrapper + + """ The following contains utils to run the documentation tests without having to overwrite any files. 
diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index 07972675528..cdbe815431f 100644 --- a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -18,7 +18,7 @@ import unittest from transformers import ImageGPTConfig -from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.testing_utils import require_torch, require_vision, run_test_using_subprocess, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available from ...generation.test_utils import GenerationTesterMixin @@ -257,11 +257,9 @@ def _check_scores(self, batch_size, scores, length, config): self.assertEqual(len(scores), length) self.assertListEqual([iter_scores.shape for iter_scores in scores], [expected_shape] * len(scores)) - @unittest.skip( - reason="After #33632, this test still passes, but many subsequential tests fail with `device-side assert triggered`" - ) + @run_test_using_subprocess def test_beam_search_generate_dict_outputs_use_cache(self): - pass + super().test_beam_search_generate_dict_outputs_use_cache() def setUp(self): self.model_tester = ImageGPTModelTester(self) diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index 1bd01843981..fd4c49f4a69 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -28,7 +28,14 @@ is_torch_available, is_vision_available, ) -from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_gpu, slow, torch_device +from transformers.testing_utils import ( + require_bitsandbytes, + require_torch, + require_torch_gpu, + run_test_using_subprocess, + slow, + torch_device, +) from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ 
-248,9 +255,7 @@ def test_flash_attn_2_fp32_ln(self): def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): pass - @unittest.skip( - reason="After #33533, this still passes, but many subsequential tests fail with `device-side assert triggered`" - ) + @run_test_using_subprocess def test_mixed_input(self): config, inputs = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: From 626c610a4d9d36427d392e0ed70a5c7018900eba Mon Sep 17 00:00:00 2001 From: Martin Gubri <1850174+Framartin@users.noreply.github.com> Date: Tue, 29 Oct 2024 11:10:10 +0100 Subject: [PATCH 53/99] Fix perplexity computation in perplexity.md (#34387) fix average NLL in perplexity.md --- docs/source/en/perplexity.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/docs/source/en/perplexity.md b/docs/source/en/perplexity.md index 7555619fe48..ac7ef8504e7 100644 --- a/docs/source/en/perplexity.md +++ b/docs/source/en/perplexity.md @@ -107,7 +107,8 @@ max_length = model.config.n_positions stride = 512 seq_len = encodings.input_ids.size(1) -nlls = [] +nll_sum = 0.0 +n_tokens = 0 prev_end_loc = 0 for begin_loc in tqdm(range(0, seq_len, stride)): end_loc = min(begin_loc + max_length, seq_len) @@ -124,13 +125,19 @@ for begin_loc in tqdm(range(0, seq_len, stride)): # to the left by 1. 
neg_log_likelihood = outputs.loss - nlls.append(neg_log_likelihood) + # Accumulate the total negative log-likelihood and the total number of tokens + num_valid_tokens = (target_ids != -100).sum().item() # number of valid tokens in target_ids + batch_size = target_ids.size(0) + num_loss_tokens = num_valid_tokens - batch_size # subtract batch_size due to internal label shift + nll_sum += neg_log_likelihood * num_loss_tokens + n_tokens += num_loss_tokens prev_end_loc = end_loc if end_loc == seq_len: break -ppl = torch.exp(torch.stack(nlls).mean()) +avg_nll = nll_sum / n_tokens # average negative log-likelihood per token +ppl = torch.exp(avg_nll) ``` Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window @@ -139,5 +146,5 @@ and the better the reported perplexity will typically be. When we run the above with `stride = 1024`, i.e. no overlap, the resulting PPL is `19.44`, which is about the same as the `19.93` reported in the GPT-2 paper. By using `stride = 512` and thereby employing our striding window -strategy, this jumps down to `16.45`. This is not only a more favorable score, but is calculated in a way that is +strategy, this jumps down to `16.44`. This is not only a more favorable score, but is calculated in a way that is closer to the true autoregressive decomposition of a sequence likelihood. 
From 9e3d704e2340fe9b306b5bd6b12605e4341c012b Mon Sep 17 00:00:00 2001 From: hlky Date: Tue, 29 Oct 2024 10:40:41 +0000 Subject: [PATCH 54/99] Fixes for Modular Converter on Windows (#34266) * Separator in regex * Standardize separator for relative path in auto generated message * open() encoding * Replace `\` on `os.path.abspath` --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- utils/modular_model_converter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index c107a483186..bda143c2577 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -56,7 +56,7 @@ def get_module_source_from_name(module_name: str) -> str: if spec is None or spec.origin is None: return f"Module {module_name} not found" - with open(spec.origin, "r") as file: + with open(spec.origin, "r", encoding="utf-8") as file: source_code = file.read() return source_code @@ -1132,7 +1132,7 @@ def convert_modular_file(modular_file, old_model_name=None, new_model_name=None, if pattern is not None: model_name = pattern.groups()[0] # Parse the Python file - with open(modular_file, "r") as file: + with open(modular_file, "r", encoding="utf-8") as file: code = file.read() module = cst.parse_module(code) wrapper = MetadataWrapper(module) @@ -1143,7 +1143,7 @@ def convert_modular_file(modular_file, old_model_name=None, new_model_name=None, if node != {}: # Get relative path starting from src/transformers/ relative_path = re.search( - rf"(src{os.sep}transformers{os.sep}.*|examples{os.sep}.*)", os.path.abspath(modular_file) + r"(src/transformers/.*|examples/.*)", os.path.abspath(modular_file).replace("\\", "/") ).group(1) header = AUTO_GENERATED_MESSAGE.format( @@ -1164,7 +1164,7 @@ def save_modeling_file(modular_file, converted_file): [line for line in converted_file[file_type][0].strip().split("\n") if not line.strip().startswith("#")] ) if 
len(converted_file[file_type][0].strip()) > 0 and non_comment_lines > 0: - with open(modular_file.replace("modular_", f"{file_type}_"), "w") as f: + with open(modular_file.replace("modular_", f"{file_type}_"), "w", encoding="utf-8") as f: f.write(converted_file[file_type][0]) else: non_comment_lines = len( @@ -1172,7 +1172,7 @@ def save_modeling_file(modular_file, converted_file): ) if len(converted_file[file_type][1].strip()) > 0 and non_comment_lines > 0: logger.warning("The modeling code contains errors, it's written without formatting") - with open(modular_file.replace("modular_", f"{file_type}_"), "w") as f: + with open(modular_file.replace("modular_", f"{file_type}_"), "w", encoding="utf-8") as f: f.write(converted_file[file_type][1]) From 004530aa050efcdd489f1ac6809626fa578636ad Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Tue, 29 Oct 2024 19:41:04 +0900 Subject: [PATCH 55/99] Fix regression loading dtype (#34409) * fix regression * add test for torchao * expected output * better fix --- src/transformers/modeling_utils.py | 9 +++++---- .../torchao_integration/test_torchao.py | 20 +++++++++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index a6fbd7b1a91..8481fa7df9c 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -943,13 +943,14 @@ def _load_state_dict_into_meta_model( old_param = model splits = param_name.split(".") for split in splits: - old_param = getattr(old_param, split) - # Not all the attributes of a module are Parameters/Tensor - if not isinstance(old_param, (torch.nn.Parameter, torch.Tensor)): - old_param = None + # We shouldn't hit the default value unless for quant methods like hqq that modifies expected_keys. 
+ old_param = getattr(old_param, split, None) if old_param is None: break + if not isinstance(old_param, (torch.nn.Parameter, torch.Tensor)): + old_param = None + if old_param is not None: if dtype is None: param = param.to(old_param.dtype) diff --git a/tests/quantization/torchao_integration/test_torchao.py b/tests/quantization/torchao_integration/test_torchao.py index 8014f745d08..c7c701e49ae 100644 --- a/tests/quantization/torchao_integration/test_torchao.py +++ b/tests/quantization/torchao_integration/test_torchao.py @@ -208,6 +208,26 @@ def test_int4wo_offload(self): self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT) + def test_int8_dynamic_activation_int8_weight_quant(self): + """ + Simple LLM model testing int8_dynamic_activation_int8_weight + """ + quant_config = TorchAoConfig("int8_dynamic_activation_int8_weight") + + # Note: we quantize the bfloat16 model on the fly to int4 + quantized_model = AutoModelForCausalLM.from_pretrained( + self.model_name, + device_map=torch_device, + quantization_config=quant_config, + ) + tokenizer = AutoTokenizer.from_pretrained(self.model_name) + + input_ids = tokenizer(self.input_text, return_tensors="pt").to(torch_device) + + output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens) + EXPECTED_OUTPUT = "What are we having for dinner?\n\nJessica: (smiling)" + self.assertEqual(tokenizer.decode(output[0], skip_special_tokens=True), EXPECTED_OUTPUT) + if __name__ == "__main__": unittest.main() From 5392f12e1614383270ae8df524415a1f6b555773 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Tue, 29 Oct 2024 06:30:02 -0700 Subject: [PATCH 56/99] Bert is ExecuTorch compatible (#34424) Co-authored-by: Guang Yang --- tests/models/bert/test_modeling_bert.py | 42 +++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 
8ac1c3d2b40..aa9835d8cd6 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -16,6 +16,8 @@ import tempfile import unittest +from packaging import version + from transformers import AutoTokenizer, BertConfig, is_torch_available from transformers.models.auto import get_values from transformers.testing_utils import ( @@ -749,3 +751,43 @@ def test_sdpa_ignored_mask(self): self.assertTrue( torch.allclose(res_eager.last_hidden_state, res_sdpa.last_hidden_state, atol=1e-5, rtol=1e-4) ) + + @slow + def test_export(self): + if version.parse(torch.__version__) < version.parse("2.4.0"): + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + bert_model = "google-bert/bert-base-uncased" + device = "cpu" + attn_implementation = "sdpa" + max_length = 512 + + tokenizer = AutoTokenizer.from_pretrained(bert_model) + inputs = tokenizer( + "the man worked as a [MASK].", + return_tensors="pt", + padding="max_length", + max_length=max_length, + ) + + model = BertForMaskedLM.from_pretrained( + bert_model, + device_map=device, + attn_implementation=attn_implementation, + use_cache=True, + ) + + logits = model(**inputs).logits + eg_predicted_mask = tokenizer.decode(logits[0, 6].topk(5).indices) + self.assertEqual(eg_predicted_mask.split(), ["carpenter", "waiter", "barber", "mechanic", "salesman"]) + + exported_program = torch.export.export( + model, + args=(inputs["input_ids"],), + kwargs={"attention_mask": inputs["attention_mask"]}, + strict=True, + ) + + result = exported_program.module().forward(inputs["input_ids"], inputs["attention_mask"]) + ep_predicted_mask = tokenizer.decode(result.logits[0, 6].topk(5).indices) + self.assertEqual(eg_predicted_mask, ep_predicted_mask) From 8755dd26b7e5ac25987a03627d317624dcdad2a1 Mon Sep 17 00:00:00 2001 From: Doohae Jung <80743307+wavy-jung@users.noreply.github.com> Date: Tue, 29 Oct 2024 22:31:36 +0900 Subject: [PATCH 57/99] manual `head_dim` for `mixtral` model (#34281) --- 
.../models/mixtral/configuration_mixtral.py | 4 ++++ src/transformers/models/mixtral/modeling_mixtral.py | 13 ++++--------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index 164988b4dc5..686c214ef25 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -53,6 +53,8 @@ class MixtralConfig(PretrainedConfig): converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group. For more details checkout [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`): + The attention head dimension. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the decoder. 
max_position_embeddings (`int`, *optional*, defaults to `4096*32`): @@ -116,6 +118,7 @@ def __init__( num_hidden_layers=32, num_attention_heads=32, num_key_value_heads=8, + head_dim=None, hidden_act="silu", max_position_embeddings=4096 * 32, initializer_range=0.02, @@ -154,6 +157,7 @@ def __init__( self.use_cache = use_cache self.rope_theta = rope_theta self.attention_dropout = attention_dropout + self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.num_experts_per_tok = num_experts_per_tok self.num_local_experts = num_local_experts diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 78a17178ecd..de1cd1097a5 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -283,7 +283,7 @@ def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None): self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads + self.head_dim = config.head_dim self.num_key_value_heads = config.num_key_value_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings @@ -291,11 +291,6 @@ def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None): self.is_causal = True self.attention_dropout = config.attention_dropout - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) @@ -374,7 +369,7 @@ def forward( ) attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = attn_output.reshape(bsz, q_len, -1) attn_output = self.o_proj(attn_output) @@ -481,7 +476,7 @@ def forward( is_causal=self.is_causal, ) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() attn_output = self.o_proj(attn_output) if not output_attentions: @@ -575,7 +570,7 @@ def forward( ) attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, self.hidden_size) + attn_output = attn_output.view(bsz, q_len, -1) attn_output = self.o_proj(attn_output) From 0ab0a4265131536d7422c57d0cc74c2afee1afd9 Mon Sep 17 00:00:00 2001 From: Shijie <821898965@qq.com> Date: Tue, 29 Oct 2024 22:27:34 +0800 Subject: [PATCH 58/99] fix-qwen2vl-no-position_ids (#33487) --- src/transformers/models/qwen2_vl/modeling_qwen2_vl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 90bf29c8b5d..17e722a217d 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1719,6 +1719,9 @@ def forward( if attention_mask is not None: attention_mask = attention_mask.to(inputs_embeds.device) + if position_ids is None and input_ids is not None: + position_ids, _ = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) + outputs = self.model( input_ids=None, position_ids=position_ids, From 
56c45d575786de60acba02838fb2b0d1176b4ff7 Mon Sep 17 00:00:00 2001 From: Abhijit Deo <72816663+abhi-glitchhg@users.noreply.github.com> Date: Tue, 29 Oct 2024 20:39:18 +0530 Subject: [PATCH 59/99] Bug fix for drop path decay rate in swin transformer (#34291) * potential bug fix for drop path * variable name change * forgot to rename the variables * back to original * modify dpr properly * check_copies auto fix * corresponsing swin2 changes * auto fix * linting * default value for drop_path_rate as 0.0 * Update src/transformers/models/glm/modeling_glm.py * maskformer fix * ruff format * changes made to tf code as well * lint --------- Co-authored-by: abhijit deo <167164474+deo-abhijit@users.noreply.github.com> --- src/transformers/models/clap/modeling_clap.py | 5 +++-- .../models/donut/modeling_donut_swin.py | 5 +++-- .../models/maskformer/modeling_maskformer_swin.py | 7 +++---- src/transformers/models/swin/modeling_swin.py | 5 +++-- src/transformers/models/swin/modeling_tf_swin.py | 14 +++++++++++--- .../models/swin2sr/modeling_swin2sr.py | 6 ++++-- src/transformers/models/swinv2/modeling_swinv2.py | 7 +++++-- 7 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index d0224e3caa5..f422b17b204 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -575,7 +575,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.swin.modeling_swin.SwinLayer with SwinDropPath->ClapDropPath, Swin->ClapAudio class ClapAudioLayer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.shift_size = shift_size @@ -583,7 +583,7 @@ def __init__(self, config, dim, 
input_resolution, num_heads, shift_size=0): self.input_resolution = input_resolution self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size) - self.drop_path = ClapDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = ClapDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.intermediate = ClapAudioIntermediate(config, dim) self.output = ClapAudioOutput(config, dim) @@ -712,6 +712,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d dim=dim, input_resolution=input_resolution, num_heads=num_heads, + drop_path_rate=drop_path[i], shift_size=0 if (i % 2 == 0) else config.window_size // 2, ) for i in range(depth) diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index 8d639131b84..2d5272e8642 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -558,7 +558,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->DonutSwin class DonutSwinLayer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.shift_size = shift_size @@ -566,7 +566,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): self.input_resolution = input_resolution self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.attention = DonutSwinAttention(config, dim, num_heads, window_size=self.window_size) - self.drop_path = 
DonutSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = DonutSwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.intermediate = DonutSwinIntermediate(config, dim) self.output = DonutSwinOutput(config, dim) @@ -695,6 +695,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d dim=dim, input_resolution=input_resolution, num_heads=num_heads, + drop_path_rate=drop_path[i], shift_size=0 if (i % 2 == 0) else config.window_size // 2, ) for i in range(depth) diff --git a/src/transformers/models/maskformer/modeling_maskformer_swin.py b/src/transformers/models/maskformer/modeling_maskformer_swin.py index 9a40e050459..598e1d8186a 100644 --- a/src/transformers/models/maskformer/modeling_maskformer_swin.py +++ b/src/transformers/models/maskformer/modeling_maskformer_swin.py @@ -520,16 +520,14 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class MaskFormerSwinLayer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0): super().__init__() self.shift_size = shift_size self.window_size = config.window_size self.input_resolution = input_resolution self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.attention = MaskFormerSwinAttention(config, dim, num_heads, self.window_size) - self.drop_path = ( - MaskFormerSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() - ) + self.drop_path = MaskFormerSwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.intermediate = MaskFormerSwinIntermediate(config, dim) self.output = MaskFormerSwinOutput(config, dim) @@ -644,6 +642,7 @@ def __init__(self, config, dim, input_resolution, 
depth, num_heads, drop_path, d dim=dim, input_resolution=input_resolution, num_heads=num_heads, + drop_path_rate=drop_path[i], shift_size=0 if (i % 2 == 0) else config.window_size // 2, ) for i in range(depth) diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 45383a36d9b..23f0ba6da62 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -635,7 +635,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class SwinLayer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.shift_size = shift_size @@ -643,7 +643,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): self.input_resolution = input_resolution self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.attention = SwinAttention(config, dim, num_heads, window_size=self.window_size) - self.drop_path = SwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = SwinDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) self.intermediate = SwinIntermediate(config, dim) self.output = SwinOutput(config, dim) @@ -771,6 +771,7 @@ def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, d dim=dim, input_resolution=input_resolution, num_heads=num_heads, + drop_path_rate=drop_path[i], shift_size=0 if (i % 2 == 0) else config.window_size // 2, ) for i in range(depth) diff --git a/src/transformers/models/swin/modeling_tf_swin.py b/src/transformers/models/swin/modeling_tf_swin.py index 035b31e8d43..f1aa0bfef74 100644 --- a/src/transformers/models/swin/modeling_tf_swin.py +++ 
b/src/transformers/models/swin/modeling_tf_swin.py @@ -742,7 +742,14 @@ def build(self, input_shape=None): class TFSwinLayer(keras.layers.Layer): def __init__( - self, config, dim, input_resolution: Tuple[int, int], num_heads: int, shift_size: int = 0, **kwargs + self, + config, + dim, + input_resolution: Tuple[int, int], + num_heads: int, + drop_path_rate: float = 0.0, + shift_size: int = 0, + **kwargs, ) -> None: super().__init__(**kwargs) self.chunk_size_feed_forward = config.chunk_size_feed_forward @@ -754,8 +761,8 @@ def __init__( self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before") self.attention = TFSwinAttention(config, dim, num_heads, name="attention") self.drop_path = ( - TFSwinDropPath(config.drop_path_rate, name="drop_path") - if config.drop_path_rate > 0.0 + TFSwinDropPath(drop_path_rate, name="drop_path") + if drop_path_rate > 0.0 else keras.layers.Activation("linear", name="drop_path") ) self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after") @@ -913,6 +920,7 @@ def __init__( input_resolution=input_resolution, num_heads=num_heads, shift_size=0 if (i % 2 == 0) else config.window_size // 2, + drop_path_rate=drop_path[i], name=f"blocks.{i}", ) for i in range(depth) diff --git a/src/transformers/models/swin2sr/modeling_swin2sr.py b/src/transformers/models/swin2sr/modeling_swin2sr.py index b0a773c8af3..d6bd8da9bed 100644 --- a/src/transformers/models/swin2sr/modeling_swin2sr.py +++ b/src/transformers/models/swin2sr/modeling_swin2sr.py @@ -482,7 +482,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # Copied from transformers.models.swinv2.modeling_swinv2.Swinv2Layer with Swinv2->Swin2SR class Swin2SRLayer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretrained_window_size=0): + def __init__( + self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0, 
pretrained_window_size=0 + ): super().__init__() self.input_resolution = input_resolution window_size, shift_size = self._compute_window_shift( @@ -500,7 +502,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretr else (pretrained_window_size, pretrained_window_size), ) self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) - self.drop_path = Swin2SRDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = Swin2SRDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.intermediate = Swin2SRIntermediate(config, dim) self.output = Swin2SROutput(config, dim) self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index 0c30e739a48..191923958cf 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -683,7 +683,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Swinv2Layer(nn.Module): - def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretrained_window_size=0): + def __init__( + self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0, pretrained_window_size=0 + ): super().__init__() self.input_resolution = input_resolution window_size, shift_size = self._compute_window_shift( @@ -701,7 +703,7 @@ def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretr else (pretrained_window_size, pretrained_window_size), ) self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) - self.drop_path = Swinv2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.drop_path = Swinv2DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() self.intermediate = Swinv2Intermediate(config, dim) self.output = Swinv2Output(config, dim) 
self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) @@ -819,6 +821,7 @@ def __init__( dim=dim, input_resolution=input_resolution, num_heads=num_heads, + drop_path_rate=drop_path[i], shift_size=0 if (i % 2 == 0) else config.window_size // 2, pretrained_window_size=pretrained_window_size, ) From 34620e8f0a974761debf52093968107c14f41315 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Tue, 29 Oct 2024 08:14:31 -0700 Subject: [PATCH 60/99] MobileBERT is ExecuTorch compatible (#34473) Co-authored-by: Guang Yang --- .../mobilebert/test_modeling_mobilebert.py | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py index d7a409427c9..d2bc11d09f1 100644 --- a/tests/models/mobilebert/test_modeling_mobilebert.py +++ b/tests/models/mobilebert/test_modeling_mobilebert.py @@ -16,7 +16,9 @@ import unittest -from transformers import MobileBertConfig, is_torch_available +from packaging import version + +from transformers import AutoTokenizer, MobileBertConfig, MobileBertForMaskedLM, is_torch_available from transformers.models.auto import get_values from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device @@ -384,3 +386,42 @@ def test_inference_no_head(self): upper_bound = torch.all((expected_slice / output[..., :3, :3]) <= 1 + TOLERANCE) self.assertTrue(lower_bound and upper_bound) + + @slow + def test_export(self): + if version.parse(torch.__version__) < version.parse("2.4.0"): + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + mobilebert_model = "google/mobilebert-uncased" + device = "cpu" + attn_implementation = "eager" + max_length = 512 + + tokenizer = AutoTokenizer.from_pretrained(mobilebert_model) + inputs = tokenizer( + f"the man worked as a {tokenizer.mask_token}.", + return_tensors="pt", + 
padding="max_length", + max_length=max_length, + ) + + model = MobileBertForMaskedLM.from_pretrained( + mobilebert_model, + device_map=device, + attn_implementation=attn_implementation, + ) + + logits = model(**inputs).logits + eg_predicted_mask = tokenizer.decode(logits[0, 6].topk(5).indices) + self.assertEqual(eg_predicted_mask.split(), ["carpenter", "waiter", "mechanic", "teacher", "clerk"]) + + exported_program = torch.export.export( + model, + args=(inputs["input_ids"],), + kwargs={"attention_mask": inputs["attention_mask"]}, + strict=True, + ) + + result = exported_program.module().forward(inputs["input_ids"], inputs["attention_mask"]) + ep_predicted_mask = tokenizer.decode(result.logits[0, 6].topk(5).indices) + self.assertEqual(eg_predicted_mask, ep_predicted_mask) From f339042b0b8bdc0b57a70d37f67cafbea960a2ab Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Tue, 29 Oct 2024 08:22:13 -0700 Subject: [PATCH 61/99] Albert is ExecuTorch compatible (#34476) Co-authored-by: Guang Yang --- tests/models/albert/test_modeling_albert.py | 46 ++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py index d1e5631b342..970f1dd8555 100644 --- a/tests/models/albert/test_modeling_albert.py +++ b/tests/models/albert/test_modeling_albert.py @@ -16,7 +16,9 @@ import unittest -from transformers import AlbertConfig, is_torch_available +from packaging import version + +from transformers import AlbertConfig, AutoTokenizer, is_torch_available from transformers.models.auto import get_values from transformers.testing_utils import require_torch, slow, torch_device @@ -342,3 +344,45 @@ def test_inference_no_head_absolute_embedding(self): ) self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) + + @slow + def test_export(self): + if version.parse(torch.__version__) < version.parse("2.4.0"): + 
self.skipTest(reason="This test requires torch >= 2.4 to run.") + + distilbert_model = "albert/albert-base-v2" + device = "cpu" + attn_implementation = "sdpa" + max_length = 64 + + tokenizer = AutoTokenizer.from_pretrained(distilbert_model) + inputs = tokenizer( + f"Paris is the {tokenizer.mask_token} of France.", + return_tensors="pt", + padding="max_length", + max_length=max_length, + ) + + model = AlbertForMaskedLM.from_pretrained( + distilbert_model, + device_map=device, + attn_implementation=attn_implementation, + ) + + logits = model(**inputs).logits + eg_predicted_mask = tokenizer.decode(logits[0, 4].topk(5).indices) + self.assertEqual( + eg_predicted_mask.split(), + ["capital", "capitol", "comune", "arrondissement", "bastille"], + ) + + exported_program = torch.export.export( + model, + args=(inputs["input_ids"],), + kwargs={"attention_mask": inputs["attention_mask"]}, + strict=True, + ) + + result = exported_program.module().forward(inputs["input_ids"], inputs["attention_mask"]) + ep_predicted_mask = tokenizer.decode(result.logits[0, 4].topk(5).indices) + self.assertEqual(eg_predicted_mask, ep_predicted_mask) From e9ad46049411624bb1b6e830fbc1138991c0135e Mon Sep 17 00:00:00 2001 From: Apoorv Khandelwal Date: Tue, 29 Oct 2024 11:23:16 -0400 Subject: [PATCH 62/99] Adding `optimizer_cls_and_kwargs` to `Trainer.__init__` (#34358) * Adding `optimizer_cls_and_kwargs` to `Trainer.__init__` * formatting * make fix-copies docstring * added more docs for optimizer_cls_and_kwargs * add docs for Trainer(optimizer_cls_and_kwargs) * reverting anchor names --- docs/source/en/trainer.md | 106 +++++++++++++++++++++++------------- src/transformers/trainer.py | 18 +++++- 2 files changed, 82 insertions(+), 42 deletions(-) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index f9ea3337699..7bee3472892 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -252,7 +252,70 @@ trainer = Trainer(..., args=training_args) NEFTune is disabled after 
training to restore the original embedding layer to avoid any unexpected behavior. -## GaLore +## Liger Kernel + +[Liger-Kernel](https://github.com/linkedin/Liger-Kernel) Kernel is a collection of Triton kernels developed by Linkedin designed specifically for LLM training. We have implemented Hugging Face Compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. + + +Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It’s also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (medusa) and more. See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples) + + +First make sure to install Liger official repository: +```bash +pip install liger-kernel +``` + +You should pass `use_liger_kernel=True` to apply liger kernel on your model, for example: + +```py +from transformers import TrainingArguments + +training_args = TrainingArguments( + output_dir="your-model", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=2, + weight_decay=0.01, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + push_to_hub=True, + use_liger_kernel=True +) +``` + +The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger_kernel` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. 
+ + +## Optimizers + +You can choose a built-in optimizer for training using: + +```python +from transformers import TrainingArguments +training_args = TrainingArguments(..., optim="adamw_torch") +``` + +See [`OptimizerNames`](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py) for a full list of choices. We include advanced examples in the sections below. + +You can also use an arbitrary PyTorch optimizer via: + +```python +import torch + +optimizer_cls = torch.optim.AdamW +optimizer_kwargs = { + "lr": 4e-3, + "betas": (0.9, 0.999), + "weight_decay": 0.05, +} + +from transformers import Trainer +trainer = Trainer(..., optimizer_cls_and_kwargs=(optimizer_cls, optimizer_kwargs)) +``` + +### GaLore Gradient Low-Rank Projection (GaLore) is a memory-efficient low-rank training strategy that allows full-parameter learning but is more memory-efficient than common low-rank adaptation methods, such as LoRA. @@ -382,42 +445,7 @@ trainer.train() Note layerwise optimization is a bit experimental and does not support DDP (Distributed Data Parallel), thus you can run the training script only on a single GPU. Please see [this appropriate section](https://github.com/jiaweizzhao/GaLore?tab=readme-ov-file#train-7b-model-with-a-single-gpu-with-24gb-memory) for more details. Other features such as gradient clipping, DeepSpeed, etc might not be supported out of the box. Please [raise an issue on GitHub](https://github.com/huggingface/transformers/issues) if you encounter such issue. -## Liger Kernel - -[Liger-Kernel](https://github.com/linkedin/Liger-Kernel) Kernel is a collection of Triton kernels developed by Linkedin designed specifically for LLM training. We have implemented Hugging Face Compatible RMSNorm, RoPE, SwiGLU, CrossEntropy, FusedLinearCrossEntropy, and more to come. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. 
The kernel works out of the box with flash attention, PyTorch FSDP, and Microsoft DeepSpeed. - - -Gain +20% throughput and reduce memory usage by 60% on LLaMA 3-8B model training. Achieve longer context lengths and larger batch sizes. It’s also useful if you want to scale up your model to multi-head training or large vocabulary sizes. Unleash multi-head training (medusa) and more. See details and examples in [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples) - - -First make sure to install Liger official repository: -```bash -pip install liger-kernel -``` - -You should pass `use_liger_kernel=True` to apply liger kernel on your model, for example: - -```py -from transformers import TrainingArguments - -training_args = TrainingArguments( - output_dir="your-model", - learning_rate=2e-5, - per_device_train_batch_size=16, - per_device_eval_batch_size=16, - num_train_epochs=2, - weight_decay=0.01, - eval_strategy="epoch", - save_strategy="epoch", - load_best_model_at_end=True, - push_to_hub=True, - use_liger_kernel=True -) -``` - -The kernel supports the Llama, Gemma, Mistral, and Mixtral model architectures. The most up-to-date list of supported models can be found [here](https://github.com/linkedin/Liger-Kernel). When `use_liger_kernel` is set to `True`, the corresponding layers in the original model will be patched with Liger's efficient implementation, so you don't need to do anything extra other than setting the argument value. - -## LOMO optimizer +### LOMO optimizer The LOMO optimizers have been introduced in [Full Parameter Fine-Tuning for Large Language Models with Limited Resources](https://hf.co/papers/2306.09782) and [AdaLomo: Low-memory Optimization with Adaptive Learning Rate](https://hf.co/papers/2310.10195). They both consist of an efficient full-parameter fine-tuning method. These optimizers fuse the gradient computation and the parameter update in one step to reduce memory usage. 
Supported optimizers for LOMO are `"lomo"` and `"adalomo"`. First either install LOMO from pypi `pip install lomo-optim` or install it from source with `pip install git+https://github.com/OpenLMLab/LOMO.git`. @@ -467,7 +495,7 @@ trainer = trl.SFTTrainer( trainer.train() ``` -## GrokAdamW optimizer +### GrokAdamW optimizer The GrokAdamW optimizer is designed to enhance training performance and stability, particularly for models that benefit from grokking signal functions. To use GrokAdamW, first install the optimizer package with `pip install grokadamw`. @@ -518,7 +546,7 @@ trainer.train() This script demonstrates how to fine-tune the `google/gemma-2b` model on the IMDB dataset using the GrokAdamW optimizer. The `TrainingArguments` are configured to use GrokAdamW, and the dataset is passed to the `Trainer` for training. -## Schedule Free Optimizer +### Schedule Free Optimizer The Schedule Free optimizers have been introduced in [The Road Less Scheduled](https://hf.co/papers/2405.15682). Schedule-Free learning replaces the momentum of the base optimizer with a combination of averaging and interpolation, to completely remove the need to anneal the learning rate with a traditional schedule. diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 9176bd72a55..e2ae622e2b6 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -34,7 +34,7 @@ import warnings from collections.abc import Mapping from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, Union # Integrations must be imported before ML frameworks: @@ -358,6 +358,11 @@ class Trainer: optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): A tuple containing the optimizer and the scheduler to use. 
Will default to an instance of [`AdamW`] on your model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + optimizer_cls_and_kwargs (`Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*): + A tuple containing the optimizer class and keyword arguments to use. + Overrides `optim` and `optim_args` in `args`. Incompatible with the `optimizers` argument. + + Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before initializing the Trainer. preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*): A function that preprocess the logits right before caching them at each evaluation step. Must take two tensors, the logits and the labels, and return the logits once processed as desired. The modifications made @@ -401,7 +406,8 @@ def __init__( compute_loss_func: Optional[Callable] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + optimizers: Tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), + optimizer_cls_and_kwargs: Optional[Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]] = None, preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ): if args is None: @@ -603,6 +609,9 @@ def __init__( self.compute_metrics = compute_metrics self.preprocess_logits_for_metrics = preprocess_logits_for_metrics self.optimizer, self.lr_scheduler = optimizers + self.optimizer_cls_and_kwargs = optimizer_cls_and_kwargs + if self.optimizer_cls_and_kwargs is not None and self.optimizer is not None: + raise RuntimeError("Passing both `optimizers` and `optimizer_cls_and_kwargs` arguments is incompatible.") if model_init is not None and (self.optimizer is not None or self.lr_scheduler is not None): 
raise RuntimeError( "Passing a `model_init` is incompatible with providing the `optimizers` argument. " @@ -1171,7 +1180,10 @@ def create_optimizer(self): }, ] - optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(self.args, opt_model) + if self.optimizer_cls_and_kwargs is not None: + optimizer_cls, optimizer_kwargs = self.optimizer_cls_and_kwargs + else: + optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(self.args, opt_model) # Overwrite `params` in case it's created by `get_optimizer_cls_and_kwargs` # e.g. for GaLore optimizer. From 4e2e8809ff4a596f7a9398c04293804a01cbe1d5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:42:40 +0000 Subject: [PATCH 63/99] Bump werkzeug from 3.0.3 to 3.0.6 in /examples/research_projects/decision_transformer (#34420) Bump werkzeug in /examples/research_projects/decision_transformer Bumps [werkzeug](https://github.com/pallets/werkzeug) from 3.0.3 to 3.0.6. - [Release notes](https://github.com/pallets/werkzeug/releases) - [Changelog](https://github.com/pallets/werkzeug/blob/main/CHANGES.rst) - [Commits](https://github.com/pallets/werkzeug/compare/3.0.3...3.0.6) --- updated-dependencies: - dependency-name: werkzeug dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .../research_projects/decision_transformer/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt index a54f3d03cab..6d42c3256a8 100644 --- a/examples/research_projects/decision_transformer/requirements.txt +++ b/examples/research_projects/decision_transformer/requirements.txt @@ -233,7 +233,7 @@ urllib3==1.26.19 wasabi==0.9.0 wcwidth==0.2.5 websocket-client==1.3.1 -Werkzeug==3.0.3 +Werkzeug==3.0.6 wrapt==1.14.0 xxhash==3.0.0 yarl==1.7.2 From f55595b177d66c3f8d9d43c8cccd52d263580bd1 Mon Sep 17 00:00:00 2001 From: Aleksey Lobanov Date: Tue, 29 Oct 2024 20:29:24 +0300 Subject: [PATCH 64/99] Fix performance in get_imports regexp (#34298) * fix: Fix performance in get_imports regexp * Minimize get_imports content regexp --- src/transformers/dynamic_module_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index 4e0e1dd3430..bf44d4b427c 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -152,7 +152,8 @@ def get_imports(filename: Union[str, os.PathLike]) -> List[str]: content = f.read() # filter out try/except block so in custom code we can have try/except imports - content = re.sub(r"\s*try\s*:\s*.*?\s*except\s*.*?:", "", content, flags=re.MULTILINE | re.DOTALL) + content = re.sub(r"\s*try\s*:.*?except.*?:", "", content, flags=re.DOTALL) + # filter out imports under is_flash_attn_2_available block for avoid import issues in cpu only environment content = re.sub( r"if is_flash_attn[a-zA-Z0-9_]+available\(\):\s*(from flash_attn\s*.*\s*)+", "", content, flags=re.MULTILINE From e4449bb790db5cb27a6cb36581991cf5e75209a9 Mon Sep 17 00:00:00 
2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Tue, 29 Oct 2024 14:08:42 -0400 Subject: [PATCH 65/99] fix incorrect warning (#34416) --- src/transformers/processing_utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index b363f8c72cc..286ca49de85 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -874,12 +874,13 @@ class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwarg else: # kwargs is a flat dictionary for key in kwargs: - if key not in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__.keys(): - logger.warning_once( - f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." - ) - elif key not in used_keys: - output_kwargs["common_kwargs"][key] = kwargs[key] + if key not in used_keys: + if key in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__.keys(): + output_kwargs["common_kwargs"][key] = kwargs[key] + else: + logger.warning_once( + f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." 
+ ) # all modality-specific kwargs are updated with common kwargs for modality in output_kwargs: From 9bee9ff5db6e68fb31065898d7e924d07c1eb9c1 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 29 Oct 2024 18:45:14 +0000 Subject: [PATCH 66/99] Un-deprecate timeout arg in pipelines (#34382) * Un-deprecate timeout * Put "timeout" on the allowed list * make fixup --- src/transformers/pipelines/depth_estimation.py | 7 +++---- src/transformers/pipelines/image_classification.py | 7 +++---- src/transformers/pipelines/image_segmentation.py | 7 +++---- src/transformers/pipelines/image_to_text.py | 8 ++++---- src/transformers/pipelines/object_detection.py | 7 +++---- .../pipelines/zero_shot_image_classification.py | 7 ++++--- tests/test_pipeline_mixin.py | 7 +++++++ 7 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/transformers/pipelines/depth_estimation.py b/src/transformers/pipelines/depth_estimation.py index ae86c552a72..2203ac09c9c 100644 --- a/src/transformers/pipelines/depth_estimation.py +++ b/src/transformers/pipelines/depth_estimation.py @@ -1,4 +1,3 @@ -import warnings from typing import List, Union from ..utils import ( @@ -72,6 +71,9 @@ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Imag A dictionary of argument names to parameter values, to control pipeline behaviour. The only parameter available right now is `timeout`, which is the length of time, in seconds, that the pipeline should wait before giving up on trying to download an image. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. Return: A dictionary or a list of dictionaries containing result. 
If the input is a single image, will return a @@ -93,9 +95,6 @@ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Imag def _sanitize_parameters(self, timeout=None, parameters=None, **kwargs): preprocess_params = {} if timeout is not None: - warnings.warn( - "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning - ) preprocess_params["timeout"] = timeout if isinstance(parameters, dict) and "timeout" in parameters: preprocess_params["timeout"] = parameters["timeout"] diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py index 20ad72e7905..0085e5eb73f 100644 --- a/src/transformers/pipelines/image_classification.py +++ b/src/transformers/pipelines/image_classification.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import warnings from typing import List, Union import numpy as np @@ -113,9 +112,6 @@ def __init__(self, *args, **kwargs): def _sanitize_parameters(self, top_k=None, function_to_apply=None, timeout=None): preprocess_params = {} if timeout is not None: - warnings.warn( - "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning - ) preprocess_params["timeout"] = timeout postprocess_params = {} if top_k is not None: @@ -159,6 +155,9 @@ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Imag top_k (`int`, *optional*, defaults to 5): The number of top labels that will be returned by the pipeline. If the provided number is higher than the number of labels available in the model configuration, it will default to the number of labels. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. 
If None, no timeout is set and + the call may block forever. Return: A dictionary or a list of dictionaries containing result. If the input is a single image, will return a diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index 0ac653fd1e8..d388e591bf9 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -1,4 +1,3 @@ -import warnings from typing import Any, Dict, List, Union import numpy as np @@ -91,9 +90,6 @@ def _sanitize_parameters(self, **kwargs): if "overlap_mask_area_threshold" in kwargs: postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"] if "timeout" in kwargs: - warnings.warn( - "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning - ) preprocess_kwargs["timeout"] = kwargs["timeout"] return preprocess_kwargs, {}, postprocess_kwargs @@ -122,6 +118,9 @@ def __call__(self, inputs=None, **kwargs) -> Union[Predictions, List[Prediction] Threshold to use when turning the predicted masks into binary values. overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5): Mask overlap threshold to eliminate small, disconnected segments. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. Return: A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py index 4beaa481920..0d37ce91dad 100644 --- a/src/transformers/pipelines/image_to_text.py +++ b/src/transformers/pipelines/image_to_text.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import warnings from typing import List, Union from ..utils import ( @@ -81,9 +80,6 @@ def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt if prompt is not None: preprocess_params["prompt"] = prompt if timeout is not None: - warnings.warn( - "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning - ) preprocess_params["timeout"] = timeout if max_new_tokens is not None: @@ -118,6 +114,10 @@ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Imag generate_kwargs (`Dict`, *optional*): Pass it to send all of these arguments directly to `generate` allowing full control of this function. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + Return: A list or a list of list of `dict`: Each result comes as a dictionary with the following key: diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py index c135b1e131a..c84f17b2bd6 100644 --- a/src/transformers/pipelines/object_detection.py +++ b/src/transformers/pipelines/object_detection.py @@ -1,4 +1,3 @@ -import warnings from typing import Any, Dict, List, Union from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends @@ -64,9 +63,6 @@ def __init__(self, *args, **kwargs): def _sanitize_parameters(self, **kwargs): preprocess_params = {} if "timeout" in kwargs: - warnings.warn( - "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning - ) preprocess_params["timeout"] = kwargs["timeout"] postprocess_kwargs = {} if "threshold" in kwargs: @@ -89,6 +85,9 @@ def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]: same format: all as HTTP(S) links, all as local paths, or all as PIL images. 
threshold (`float`, *optional*, defaults to 0.5): The probability necessary to make a prediction. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. Return: A list of dictionaries or a list of list of dictionaries containing the result. If the input is a single diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index 253c684fcbb..c53b515dccc 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -94,6 +94,10 @@ def __call__(self, image: Union[str, List[str], "Image", List["Image"]] = None, replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are already formatted. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + Return: A list of dictionaries containing one entry per proposed label. 
Each dictionary contains the following keys: @@ -113,9 +117,6 @@ def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs): if "candidate_labels" in kwargs: preprocess_params["candidate_labels"] = kwargs["candidate_labels"] if "timeout" in kwargs: - warnings.warn( - "The `timeout` argument is deprecated and will be removed in version 5 of Transformers", FutureWarning - ) preprocess_params["timeout"] = kwargs["timeout"] if "hypothesis_template" in kwargs: preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index fe8a1972372..f079bcdd92e 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -916,6 +916,8 @@ def parse_args_from_docstring_by_indentation(docstring): def compare_pipeline_args_to_hub_spec(pipeline_class, hub_spec): + ALLOWED_TRANSFORMERS_ONLY_ARGS = ["timeout"] + docstring = inspect.getdoc(pipeline_class.__call__).strip() docstring_args = set(parse_args_from_docstring_by_indentation(docstring)) hub_args = set(get_arg_names_from_hub_spec(hub_spec)) @@ -933,6 +935,11 @@ def compare_pipeline_args_to_hub_spec(pipeline_class, hub_spec): hub_args.remove(js_generate_args[0]) docstring_args.remove(docstring_generate_args[0]) + # Special casing 2: We permit some transformers-only arguments that don't affect pipeline output + for arg in ALLOWED_TRANSFORMERS_ONLY_ARGS: + if arg in docstring_args and arg not in hub_args: + docstring_args.remove(arg) + if hub_args != docstring_args: error = [f"{pipeline_class.__name__} differs from JS spec {hub_spec.__name__}"] matching_args = hub_args & docstring_args From cd277618d4dbcafff108739e46584fd0a5c8f872 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Wed, 30 Oct 2024 01:36:45 -0700 Subject: [PATCH 67/99] Roberta is ExecuTorch compatible (#34425) * Roberta is ExecuTorch compatible * [run_slow] roberta --------- Co-authored-by: Guang Yang --- 
tests/models/roberta/test_modeling_roberta.py | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index ca557937803..1c128513b17 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -16,7 +16,7 @@ import unittest -from transformers import RobertaConfig, is_torch_available +from transformers import AutoTokenizer, RobertaConfig, is_torch_available from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin @@ -41,6 +41,7 @@ RobertaEmbeddings, create_position_ids_from_input_ids, ) + from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4 ROBERTA_TINY = "sshleifer/tiny-distilroberta-base" @@ -576,3 +577,43 @@ def test_inference_classification_head(self): # expected_tensor = roberta.predict("mnli", input_ids, return_logits=True).detach() self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) + + @slow + def test_export(self): + if not is_torch_greater_or_equal_than_2_4: + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + roberta_model = "FacebookAI/roberta-base" + device = "cpu" + attn_implementation = "sdpa" + max_length = 512 + + tokenizer = AutoTokenizer.from_pretrained(roberta_model) + inputs = tokenizer( + "The goal of life is .", + return_tensors="pt", + padding="max_length", + max_length=max_length, + ) + + model = RobertaForMaskedLM.from_pretrained( + roberta_model, + device_map=device, + attn_implementation=attn_implementation, + use_cache=True, + ) + + logits = model(**inputs).logits + eager_predicted_mask = tokenizer.decode(logits[0, 6].topk(5).indices) + self.assertEqual(eager_predicted_mask.split(), ["happiness", "love", "peace", "freedom", "simplicity"]) + + exported_program = torch.export.export( + model, + args=(inputs["input_ids"],), + 
kwargs={"attention_mask": inputs["attention_mask"]}, + strict=True, + ) + + result = exported_program.module().forward(inputs["input_ids"], inputs["attention_mask"]) + exported_predicted_mask = tokenizer.decode(result.logits[0, 6].topk(5).indices) + self.assertEqual(eager_predicted_mask, exported_predicted_mask) From 25a9fc584acb09afecb08b6cfd74e705058bf2ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peth=C5=91=20Gergely?= Date: Wed, 30 Oct 2024 10:03:41 +0100 Subject: [PATCH 68/99] Fix format mistake in string repr of tokenizer objects (#34493) * fix repr string format for tokenizer objects The repr of tokenizer tokens looks confusing and just stupid, like this: `Tokenizer(...), added_tokens_decoder={1: ..., 2: ...}`. The dict that is the value of the added_tokens_decoder attribute is outside of the parentheses of the tokenizer object, whereas all other attributes are inside the parentheses like they should be. This commit fixes this bug. * cos: add newline before closing parenthesis of repr string --- src/transformers/tokenization_utils_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 4f3187d510f..89ab2dc9260 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1687,8 +1687,8 @@ def __repr__(self) -> str: f"{self.__class__.__name__}(name_or_path='{self.name_or_path}'," f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast}," f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}'," - f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}), " - " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}" + f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}," + " added_tokens_decoder={\n\t" + 
added_tokens_decoder_rep + "\n}\n)" ) def __len__(self) -> int: From 0f764a5af77de9a8f74521ec2237dc79feb40a71 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 30 Oct 2024 10:11:50 +0100 Subject: [PATCH 69/99] Mllama: update docs (#34334) * update docs * be more explicit * use avaialble methods --- docs/source/en/model_doc/mllama.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/source/en/model_doc/mllama.md b/docs/source/en/model_doc/mllama.md index 9cb038ed2e3..4a6080ea2ce 100644 --- a/docs/source/en/model_doc/mllama.md +++ b/docs/source/en/model_doc/mllama.md @@ -30,6 +30,25 @@ The Llama 3.2-Vision collection of multimodal large language models (LLMs) is a - The text passed to the processor should have the `"<|image|>"` tokens where the images should be inserted. - The processor has its own `apply_chat_template` method to convert chat messages to text that can then be passed as text to the processor. + + + +Mllama has an extra token used as a placeholder for image positions in the text. It means that input ids and an input embedding layer will have an extra token. But since the weights for input and output embeddings are not tied, the `lm_head` layer has one less token and will fail if you want to calculate loss on image tokens or apply some logit processors. In case you are training, make sure to mask out special `"<|image|>"` tokens in the `labels` as the model should not be trained on predicting them. + +Otherwise, if you see CUDA-side index errors when generating, use the code below to expand the `lm_head` by one more token.
+ + +```python +old_embeddings = model.get_output_embeddings() + +num_tokens = model.vocab_size + 1 +resized_embeddings = model._get_resized_lm_head(old_embeddings, new_num_tokens=num_tokens, mean_resizing=True) +resized_embeddings.requires_grad_(old_embeddings.weight.requires_grad) +model.set_output_embeddings(resized_embeddings) +``` + + + ## Usage Example #### Instruct model From 913330ca9f80b0a308d7490a02274b01b51e6051 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 30 Oct 2024 10:21:37 +0100 Subject: [PATCH 70/99] VLMs: fix number of image tokens (#34332) * fix * fix tests * add tests * style * style * fix qwen after rebase * fix video llava --- .../models/chameleon/modeling_chameleon.py | 2 +- .../models/llava/modeling_llava.py | 5 +-- .../modeling_llava_next_video.py | 1 + .../modular_llava_next_video.py | 1 + .../modeling_llava_onevision.py | 2 ++ .../models/qwen2_vl/modeling_qwen2_vl.py | 5 +-- .../video_llava/modeling_video_llava.py | 8 ++--- .../models/vipllava/modeling_vipllava.py | 4 +-- tests/models/llava/test_modeling_llava.py | 29 +++++++++++++++ .../llava_next/test_modeling_llava_next.py | 32 +++++++++++++++++ .../test_modeling_llava_next_video.py | 32 +++++++++++++++++ .../paligemma/test_modeling_paligemma.py | 30 ++++++++++++++++ .../models/qwen2_vl/test_modeling_qwen2_vl.py | 36 ++++++++++++++++++- .../video_llava/test_modeling_video_llava.py | 35 ++++++++++++++++-- .../models/vipllava/test_modeling_vipllava.py | 30 ++++++++++++++++ 15 files changed, 237 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 79790827793..0661da87279 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -1288,7 +1288,7 @@ def forward( if pixel_values is not None: image_tokens = self.get_image_tokens(pixel_values) n_image_tokens_in_text = (input_ids == 
self.vocabulary_mapping.image_token_id).sum().item() - n_image_features = image_tokens.shape[0] + n_image_features = image_tokens.shape[0] * image_tokens.shape[1] if n_image_tokens_in_text != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens_in_text}, features {n_image_features}" diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index a0079f1787a..6d6bf4a6f38 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -527,8 +527,9 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 elif image_features is not None: - n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() - n_image_features = image_features.shape[1] + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] * image_features.shape[1] + if n_image_tokens != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 44b372535d7..c40ee1f70f9 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -1020,6 +1020,7 @@ def forward( if image_features is not None: n_image_tokens = (input_ids == self.config.image_token_index).sum().item() n_image_features = image_features.shape[0] + if n_image_tokens != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py 
b/src/transformers/models/llava_next_video/modular_llava_next_video.py index e9974e92049..1425a017dc0 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -533,6 +533,7 @@ def forward( if image_features is not None: n_image_tokens = (input_ids == self.config.image_token_index).sum().item() n_image_features = image_features.shape[0] + if n_image_tokens != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 946688bfcf0..f8bdb5bf8d5 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -679,6 +679,7 @@ def forward( ) n_image_tokens = (input_ids == self.config.image_token_index).sum().item() n_image_features = image_features.shape[0] + if n_image_tokens != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" @@ -704,6 +705,7 @@ def forward( ) video_features = torch.cat((video_features, image_newline), dim=1) video_features = video_features.flatten(0, 1) + n_video_tokens = (input_ids == self.config.video_token_index).sum().item() n_video_features = video_features.shape[0] if n_video_tokens != n_video_features: diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 17e722a217d..9c0d0b45ee8 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -1503,13 +1503,14 @@ def get_rope_index( mrope_position_deltas = [] if image_grid_thw is not None or video_grid_thw is not None: total_input_ids = 
input_ids + if attention_mask is None: + attention_mask = torch.ones_like(total_input_ids) position_ids = torch.ones( 3, input_ids.shape[0], input_ids.shape[1], dtype=input_ids.dtype, device=input_ids.device ) image_index, video_index = 0, 0 for i, input_ids in enumerate(total_input_ids): - if attention_mask is not None: - input_ids = input_ids[attention_mask[i] == 1] + input_ids = input_ids[attention_mask[i] == 1] image_nums, video_nums = 0, 0 vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1) vision_tokens = input_ids[vision_start_indices + 1] diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 30f82e45056..02efc7c344f 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -628,8 +628,8 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 else: if pixel_values_images is not None: - n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() - n_image_features = image_features.shape[1] + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] * image_features.shape[1] if n_image_tokens != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" @@ -644,8 +644,8 @@ def forward( inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) if pixel_values_videos is not None: - n_video_tokens = (input_ids == self.config.video_token_index).sum(dim=-1)[0].item() - n_video_features = video_features.shape[1] + n_video_tokens = (input_ids == self.config.video_token_index).sum().item() + n_video_features = video_features.shape[0] * video_features.shape[1] if n_video_tokens != n_video_features: raise ValueError( f"Video features and video tokens do not match: 
tokens: {n_video_tokens}, features {n_video_features}" diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index c9db6e261c6..4060f8c8ecd 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -517,8 +517,8 @@ def forward( # TODO: @raushan retain only the new behavior after v4.47 elif image_features is not None: - n_image_tokens = (input_ids == self.config.image_token_index).sum(dim=-1)[0].item() - n_image_features = image_features.shape[1] + n_image_tokens = (input_ids == self.config.image_token_index).sum().item() + n_image_features = image_features.shape[0] * image_features.shape[1] if n_image_tokens != n_image_features: raise ValueError( f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 405fad1bd31..1a17f18de34 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -235,6 +235,35 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] 
+ with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index 6589bf14d24..e088b250536 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -283,6 +283,38 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] + input_dict["image_sizes"] = input_dict["image_sizes"][-1:, ...] 
+ with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + image_sizes = input_dict["image_sizes"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + image_sizes = torch.cat([image_sizes, image_sizes], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index 05fc8a49e1e..edf1dd2d4c0 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -303,6 +303,38 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. 
+ """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] + input_dict["image_sizes"] = input_dict["image_sizes"][-1:, ...] + with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + image_sizes = input_dict["image_sizes"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + image_sizes = torch.cat([image_sizes, image_sizes], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index cfc2a2c29b1..95ae59dfc08 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -236,6 +236,36 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + # Copied from tests.models.llava.test_modeling_llava.LlavaForConditionalGenerationModelTest.test_mismatching_num_image_tokens + def 
test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] + with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 956243dcceb..e1cd715f8f1 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -58,7 +58,7 @@ class Qwen2VLVisionText2TextModelTester: def __init__( self, parent, - batch_size=2, + batch_size=3, seq_length=7, num_channels=3, ignore_index=-100, @@ -245,6 +245,40 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not 
properly initialized", ) + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + patch_size = config.vision_config.patch_size + one_img_length = (self.model_tester.image_size**2) // (patch_size**2) + input_dict["pixel_values"] = input_dict["pixel_values"][-one_img_length:, ...] + input_dict["image_grid_thw"] = input_dict["image_grid_thw"][-1:, ...] + with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:one_img_length] + image_grid_thw = input_dict["image_grid_thw"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_grid_thw=image_grid_thw) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + image_grid_thw = torch.cat([image_grid_thw, image_grid_thw], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values, image_grid_thw=image_grid_thw) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/video_llava/test_modeling_video_llava.py 
b/tests/models/video_llava/test_modeling_video_llava.py index fd4c49f4a69..e25ad1d4446 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -123,9 +123,9 @@ def __init__( self.batch_size = 5 self.num_channels = 3 self.image_size = 224 - self.encoder_seq_length = 64 + self.encoder_seq_length = 246 self.num_image_tokens = 25 - self.num_video_tokens = 26 + self.num_video_tokens = 26 * self.num_frames self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens def get_config(self): @@ -267,7 +267,7 @@ def test_mixed_input(self): # if we remove some images from inputs leaving only one # image number mismatch error should raise inputs["pixel_values_images"] = inputs["pixel_values_images"][:1] - with self.assertRaises(RuntimeError): + with self.assertRaises(ValueError): _ = model(**inputs) def test_video_only_input(self): @@ -401,6 +401,35 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values_images"] = input_dict["pixel_values_images"][-1:, ...] 
+ with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values_images"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values_images=pixel_values) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + _ = model(input_ids=input_ids, pixel_values_images=pixel_values) + @require_torch class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index 2c241c23f26..a976e3cb51f 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -217,6 +217,36 @@ def test_inputs_embeds_matches_input_ids(self): out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) + # Copied from tests.models.llava.test_modeling_llava.LlavaForConditionalGenerationModelTest.test_mismatching_num_image_tokens + def test_mismatching_num_image_tokens(self): + """ + Tests that VLMs through an error with explicit message saying what is wrong + when number of images don't match number of image tokens in the text. + Also we need to test multi-image cases when one prompr has multiple image tokens. + """ + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config).to(torch_device) + _ = model(**input_dict) # successfull forward with no modifications + + # remove one image but leave the image token in text + input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...] 
+ with self.assertRaises(ValueError): + _ = model(**input_dict) + + # simulate multi-image case by concatenating inputs where each has exactly one image/image-token + input_ids = input_dict["input_ids"][:1] + pixel_values = input_dict["pixel_values"][:1] + input_ids = torch.cat([input_ids, input_ids], dim=0) + + # one image and two image tokens raise an error + with self.assertRaises(ValueError): + _ = model(input_ids=input_ids, pixel_values=pixel_values) + + # two images and two image tokens don't raise an error + pixel_values = torch.cat([pixel_values, pixel_values], dim=0) + _ = model(input_ids=input_ids, pixel_values=pixel_values) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) From 8a734ea2c340beee23e665601919814918bf4c43 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 30 Oct 2024 10:59:08 +0000 Subject: [PATCH 71/99] Tests: move `generate` tests to the right mixin and delete redundant tests (#34464) * tmp commit * tmp commit * cull overwrites of deleted tests * typo * more specific docstring * make fixup * parameterize at the top? 
* correction * more deletions :D * tmp commit * for VLMs too * fix _check_outputs * test nit * make fixup * fix another flaky * test_generate_from_inputs_embeds -- handle missing attention mask --- src/transformers/generation/utils.py | 33 +- .../modeling_llava_next_video.py | 3 +- .../modular_llava_next_video.py | 3 +- .../modeling_llava_onevision.py | 3 +- .../models/musicgen/modeling_musicgen.py | 4 +- .../modeling_musicgen_melody.py | 4 +- .../video_llava/modeling_video_llava.py | 3 +- tests/generation/test_utils.py | 377 +++++++++------- tests/models/bart/test_modeling_bart.py | 5 - tests/models/bert/test_modeling_bert.py | 5 - .../chameleon/test_modeling_chameleon.py | 40 -- tests/models/gemma/test_modeling_gemma.py | 48 -- tests/models/gemma2/test_modeling_gemma2.py | 1 - tests/models/glm/test_modeling_glm.py | 40 -- tests/models/gptj/test_modeling_gptj.py | 45 +- tests/models/granite/test_modeling_granite.py | 47 +- .../granitemoe/test_modeling_granitemoe.py | 45 -- tests/models/idefics/test_modeling_idefics.py | 7 - .../models/idefics2/test_modeling_idefics2.py | 45 -- .../models/idefics3/test_modeling_idefics3.py | 78 ---- tests/models/jamba/test_modeling_jamba.py | 87 ---- tests/models/jetmoe/test_modeling_jetmoe.py | 80 ---- tests/models/kosmos2/test_modeling_kosmos2.py | 6 - tests/models/llama/test_modeling_llama.py | 41 -- tests/models/mamba2/test_modeling_mamba2.py | 10 +- tests/models/mimi/test_modeling_mimi.py | 17 - tests/models/mistral/test_modeling_mistral.py | 80 ---- tests/models/mixtral/test_modeling_mixtral.py | 80 ---- tests/models/mllama/test_modeling_mllama.py | 1 - tests/models/moshi/test_modeling_moshi.py | 61 +-- tests/models/mt5/test_modeling_mt5.py | 3 - .../models/musicgen/test_modeling_musicgen.py | 281 ------------ .../test_modeling_musicgen_melody.py | 143 ------ .../models/nemotron/test_modeling_nemotron.py | 2 - .../paligemma/test_modeling_paligemma.py | 4 - tests/models/phi/test_modeling_phi.py | 41 -- 
tests/models/qwen2/test_modeling_qwen2.py | 80 ---- .../qwen2_moe/test_modeling_qwen2_moe.py | 80 ---- .../models/qwen2_vl/test_modeling_qwen2_vl.py | 4 - .../test_modeling_recurrent_gemma.py | 4 - .../starcoder2/test_modeling_starcoder2.py | 80 ---- tests/models/t5/test_modeling_t5.py | 3 - tests/models/umt5/test_modeling_umt5.py | 3 - tests/models/whisper/test_modeling_whisper.py | 70 --- tests/models/zamba/test_modeling_zamba.py | 87 ---- tests/test_modeling_common.py | 425 ------------------ 46 files changed, 263 insertions(+), 2346 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index efe953db051..6e6d5b8bdce 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -378,10 +378,14 @@ def prepare_inputs_for_generation( # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. 
+ # (we can't check exception 3 while compiling) if past_key_values is not None: model_inputs["past_key_values"] = past_key_values - if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]: # Exception 1 or Exception 3 + if ( + inputs_embeds is not None # Exception 1 + or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]) # Exception 3 + ): input_ids = input_ids[:, -cache_position.shape[0] :] elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) input_ids = input_ids[:, cache_position] @@ -414,7 +418,7 @@ def prepare_inputs_for_generation( for model_input_name in ["position_ids", "token_type_ids"]: model_input = kwargs.get(model_input_name) if model_input is not None: - if past_key_values: + if past_key_values is not None: model_input = model_input[:, -input_ids.shape[1] :] model_input = model_input.clone(memory_format=torch.contiguous_format) model_inputs[model_input_name] = model_input @@ -568,27 +572,34 @@ def _maybe_initialize_input_ids_for_generation( def _prepare_attention_mask_for_generation( self, - inputs: torch.Tensor, - pad_token_id: Optional[torch.Tensor], - eos_token_id: Optional[torch.Tensor], + inputs_tensor: torch.Tensor, + generation_config: GenerationConfig, + model_kwargs: Dict[str, Any], ) -> torch.LongTensor: + pad_token_id = generation_config._pad_token_tensor + eos_token_id = generation_config._eos_token_tensor + + # `input_ids` may be present in the model kwargs, instead of being the main input (e.g. 
multimodal model) + if "input_ids" in model_kwargs and model_kwargs["input_ids"].shape[1] > 0: + inputs_tensor = model_kwargs["input_ids"] + # No information for attention mask inference -> return default attention mask - default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) + default_attention_mask = torch.ones(inputs_tensor.shape[:2], dtype=torch.long, device=inputs_tensor.device) if pad_token_id is None: return default_attention_mask - is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [torch.int, torch.long] + is_input_ids = len(inputs_tensor.shape) == 2 and inputs_tensor.dtype in [torch.int, torch.long] if not is_input_ids: return default_attention_mask is_pad_token_in_inputs = (pad_token_id is not None) and ( - isin_mps_friendly(elements=inputs, test_elements=pad_token_id).any() + isin_mps_friendly(elements=inputs_tensor, test_elements=pad_token_id).any() ) is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~( isin_mps_friendly(elements=eos_token_id, test_elements=pad_token_id).any() ) can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id - attention_mask_from_padding = inputs.ne(pad_token_id).long() + attention_mask_from_padding = inputs_tensor.ne(pad_token_id).long() attention_mask = ( attention_mask_from_padding * can_infer_attention_mask + default_attention_mask * ~can_infer_attention_mask @@ -2020,7 +2031,7 @@ def generate( if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) elif kwargs_has_attention_mask: # TODO (joao): generalize this check with other types of inputs diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py 
b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index c40ee1f70f9..85c109919da 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -911,7 +911,8 @@ def forward( if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify either one" ) legacy_processing = False diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 1425a017dc0..2025140bb6e 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -424,7 +424,8 @@ def forward( if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify either one" ) legacy_processing = False diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index f8bdb5bf8d5..2aa6b2fa1d6 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -657,7 +657,8 @@ def forward( if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values/pixel_values_videos and inputs_embeds at 
the same time, and must specify either one" + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify either one" ) if inputs_embeds is None: diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index c18e1d1c9d8..109ddfb626d 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1562,7 +1562,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - input_ids, generation_config._pad_token_tensor, generation_config._eos_token_tensor + input_ids, generation_config, model_kwargs ) # 5. Prepare `max_length` depending on other stopping criteria. @@ -2578,7 +2578,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) if "encoder_outputs" not in model_kwargs: diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index d2f339afc41..61f2ce414e1 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -1484,7 +1484,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - input_ids, generation_config._pad_token_tensor, generation_config._eos_token_tensor + input_ids, generation_config, model_kwargs ) # 5. Prepare `max_length` depending on other stopping criteria. 
@@ -2425,7 +2425,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) if "encoder_hidden_states" not in model_kwargs: diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 02efc7c344f..a3b3de33fa6 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -534,7 +534,8 @@ def forward( if (pixel_values_images is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values_images`/`pixel_values_videos` and `inputs_embeds` at the same " + "time, and must specify either one" ) legacy_processing = False diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index d552bf73442..545b696d673 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -29,6 +29,7 @@ from transformers.testing_utils import ( is_flaky, require_accelerate, + require_flash_attn, require_optimum_quanto, require_torch, require_torch_gpu, @@ -136,6 +137,34 @@ def prepare_config_and_inputs_for_generate(self, batch_size=2): return config, filtered_inputs_dict + def _check_similar_generate_outputs(self, output_1, output_2, atol=1e-5, rtol=1e-5): + """ + Checks whether a pair of generate outputs are similar. Two `generate` call outputs are considered similar in + the following situations: + 1. The sequences are the same + 2. 
The sequences are different, but the scores up to (and including) the first mismatch are nearly identical + """ + # scores doesn't include data regarding decoder input tokens + decoder_input_length = output_1.sequences.shape[1] - len(output_1.scores) + output_matches = output_1.sequences == output_2.sequences + has_matching_outputs = output_matches.all() + has_matching_scores = None + if not has_matching_outputs: + for batch_idx in range(output_1.sequences.shape[0]): + batch_matches = output_matches[batch_idx] + if batch_matches.all(): + continue + first_mismatch_idx = batch_matches.int().argmin() # gets the index of the first False + first_mismatch_idx -= decoder_input_length + output_1_first_mismatch_scores = output_1.scores[first_mismatch_idx][batch_idx] + output_2_first_mismatch_scores = output_2.scores[first_mismatch_idx][batch_idx] + has_matching_scores = torch.allclose( + output_1_first_mismatch_scores, output_2_first_mismatch_scores, rtol=rtol, atol=atol + ) + if not has_matching_scores: + break + self.assertTrue(has_matching_outputs or has_matching_scores) + def _get_logits_processor_kwargs(self, do_sample=False, config=None): logits_processor_kwargs = { "bad_words_ids": [[1, 0]], @@ -426,7 +455,6 @@ def test_greedy_generate(self): def test_greedy_generate_dict_outputs(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate( @@ -453,13 +481,12 @@ def test_greedy_generate_dict_outputs(self): # Retrocompatibility check self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) - self._check_outputs(output_generate, main_input, model.config) + self._check_outputs(output_generate, model.config) @pytest.mark.generate def test_greedy_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, 
inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -486,7 +513,7 @@ def test_greedy_generate_dict_outputs_use_cache(self): output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] ) - self._check_outputs(output_generate, main_input, model.config, use_cache=True) + self._check_outputs(output_generate, model.config, use_cache=True) @pytest.mark.generate def test_sample_generate(self): @@ -505,7 +532,6 @@ def test_sample_generate(self): def test_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate( @@ -533,7 +559,7 @@ def test_sample_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, SampleDecoderOnlyOutput) - self._check_outputs(output_generate, main_input, model.config, num_return_sequences=2) + self._check_outputs(output_generate, model.config, num_return_sequences=2) @pytest.mark.generate def test_beam_search_generate(self): @@ -554,7 +580,6 @@ def test_beam_search_generate(self): def test_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -583,14 +608,16 @@ def test_beam_search_generate_dict_output(self): self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, + 
model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], ) @pytest.mark.generate def test_beam_search_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -623,10 +650,10 @@ def test_beam_search_generate_dict_outputs_use_cache(self): self._check_outputs( output_generate, - main_input, model.config, use_cache=True, - num_return_sequences=beam_kwargs["num_beams"], + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], ) @require_accelerate @@ -675,7 +702,6 @@ def test_beam_sample_generate(self): def test_beam_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -706,7 +732,10 @@ def test_beam_sample_generate_dict_output(self): self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], ) @pytest.mark.generate @@ -765,7 +794,6 @@ def test_group_beam_search_generate(self): def test_group_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_diverse_beam_kwargs() @@ 
-794,7 +822,10 @@ def test_group_beam_search_generate_dict_output(self): self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], ) # TODO: @gante check why it is flaky @@ -859,7 +890,6 @@ def test_constrained_beam_search_generate(self): def test_constrained_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() @@ -899,7 +929,10 @@ def test_constrained_beam_search_generate_dict_output(self): self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], ) @pytest.mark.generate @@ -942,7 +975,6 @@ def test_contrastive_generate_dict_outputs_use_cache(self): self.skipTest(reason="Won't fix: old model with different cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] # NOTE: contrastive search only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -968,7 +1000,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] ) - self._check_outputs(output_generate, main_input, model.config, use_cache=True) + self._check_outputs(output_generate, model.config, use_cache=True) @pytest.mark.generate def test_contrastive_generate_low_memory(self): @@ -1064,14 +1096,10 @@ def test_beam_search_low_memory(self): @pytest.mark.generate @parameterized.expand([("random",), ("same",)]) - @is_flaky() # Read NOTE (1) below. If there are API issues, all attempts will fail. def test_assisted_decoding_matches_greedy_search(self, assistant_type): # This test ensures that the assisted generation does not introduce output changes over greedy search. - # NOTE (1): The sentence above is true most of the time, there is a tiny difference in the logits due to matmul - # shape differences -- and it may result in a different output. The input shape difference happens in the - # main model, that runs the forward pass with several candidates at once (as opposed to generating one token at - # a time). See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 for more info. - # NOTE (2): It breaks the pattern in the tests above, for multiple reasons: + # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 for more info. + # NOTE: It breaks the pattern in the tests above, for multiple reasons: # - assisted_decoding, contrarily to the other methods, can't be called on its own (e.g. 
needs to # prepare the assistant encoder outputs in the main generate body); # - assisted_decoding does not support `use_cache = False` @@ -1100,7 +1128,6 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1141,12 +1168,10 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): output_assisted = model.generate(**generation_kwargs, **inputs_dict) # The two outputs must match and their shape must be as expected - - self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist()) + self._check_similar_generate_outputs(output_greedy, output_assisted) for output in (output_greedy, output_assisted): - self._check_outputs(output, main_input, model.config, use_cache=True) + self._check_outputs(output, model.config, use_cache=True) - @is_flaky() @pytest.mark.generate def test_prompt_lookup_decoding_matches_greedy_search(self): # This test ensures that the prompt lookup generation does not introduce output changes over greedy search. @@ -1175,7 +1200,6 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -1208,10 +1232,9 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): output_prompt_lookup = model.generate(**generation_kwargs, **inputs_dict) # The two outputs must match and their shape must be as expected - - self.assertListEqual(output_greedy.sequences.tolist(), output_prompt_lookup.sequences.tolist()) + self._check_similar_generate_outputs(output_greedy, output_prompt_lookup) for output in (output_greedy, output_prompt_lookup): - self._check_outputs(output, main_input, model.config, use_cache=True) + self._check_outputs(output, model.config, use_cache=True) @pytest.mark.generate def test_dola_decoding_sample(self): @@ -1231,7 +1254,6 @@ def test_dola_decoding_sample(self): # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] # Encoder-decoder models are not supported if config.is_encoder_decoder: @@ -1259,7 +1281,7 @@ def test_dola_decoding_sample(self): "dola_layers": "low", } output_dola = model.generate(**generation_kwargs, **inputs_dict) - self._check_outputs(output_dola, main_input, model.config, use_cache=getattr(config, "use_cache", False)) + self._check_outputs(output_dola, model.config, use_cache=getattr(config, "use_cache", False)) @pytest.mark.generate def test_assisted_decoding_sample(self): @@ -1289,7 +1311,6 @@ def test_assisted_decoding_sample(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -1321,7 +1342,7 @@ def test_assisted_decoding_sample(self): } output_assisted = model.generate(**generation_kwargs, **inputs_dict) - self._check_outputs(output_assisted, main_input, config, use_cache=True) + self._check_outputs(output_assisted, config, use_cache=True) @pytest.mark.generate def test_prompt_lookup_decoding_stops_at_eos(self): @@ -1547,75 +1568,93 @@ def test_past_key_values_format(self): ) @pytest.mark.generate - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): + """Tests that we can generate from `inputs_embeds` instead of `input_ids` in LLMs, VLMs, etc""" # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` # if fails, you should probably update the `prepare_inputs_for_generation` function for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - # b) embedding scaling, the scaling factor applied after embeding from input_ids (requires knowledge of the - # variable that holds the scaling factor, which is model-dependent) - if hasattr(config, "scale_embedding"): - config.scale_embedding = False - # This test is for decoder-only models (encoder-decoder models have native input embeddings support in the # decoder) if config.is_encoder_decoder: continue + config.is_decoder = True # Skip models without explicit support - config.is_decoder = True model = model_class(config).to(torch_device).eval() if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters.keys(): continue + # There are a few 
exception patterns in this test: + # 1 - Some models can't generate without `input_ids`, when `inputs_embeds` are passed + requires_inputs_ids = any( + model_name in model_class.__name__.lower() for model_name in ["idefics", "qwen2vl"] + ) + # 2 - Complex `inputs_embeds` computation, i.e. the correct computation of inputs embeds is more complex + # than calling the embedding layer with `input_ids`. Subcases of this exception: + # 2.A - Ignore `scale_embedding`, if the model supports it (it is controlled by a model-dependent flag) + if hasattr(config, "scale_embedding"): + config.scale_embedding = False + # 2.B - Some VLMs assume `inputs_embeds` and `pixel_values` are mutually exclusive AND fall in the + # exception above (complex `inputs_embeds` computation). Popping `pixel_values` allows us to run the + # checks without adding test complexity. Ditto for `pixel_values_videos` and `pixel_values_images` + pixel_values_is_mutually_exclusive = any( + model_name in model_class.__name__.lower() + for model_name in ["llava", "idefics2", "idefics3", "mllama", "paligemma"] + ) + if pixel_values_is_mutually_exclusive: + inputs_dict.pop("pixel_values", None) + inputs_dict.pop("pixel_values_videos", None) + inputs_dict.pop("pixel_values_images", None) + # 2.C - No easy fix, let's skip the check that compares the outputs from `input_ids` and `inputs_embeds` + has_complex_embeds_computation = any( + model_name in model_class.__name__.lower() for model_name in ["moshi"] + ) + # 3 - `inputs_dict` doesn't contain `attention_mask`. When `attention_mask` is not passed to generate, + # we infer it from `input_ids`. The last test case will fail if there is a pad token in the original input. 
+ missing_attention_mask = "attention_mask" not in inputs_dict + + # Traditional way of generating text input_ids = inputs_dict.pop("input_ids") generation_kwargs = { "return_dict_in_generate": True, "output_scores": True, "num_beams": num_beams, "do_sample": False, + "max_new_tokens": 5, + "min_new_tokens": 5, # generate exactly 5 tokens } - - # Traditional way of generating text - outputs_from_ids = model.generate(input_ids, max_new_tokens=5, **generation_kwargs) + outputs_from_ids = model.generate(input_ids, **generation_kwargs, **inputs_dict) self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) + # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output). + # The output of the two calls should be the same. inputs_embeds = model.get_input_embeddings()(input_ids) outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - **generation_kwargs, + input_ids, inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) + if not has_complex_embeds_computation: + self._check_similar_generate_outputs(outputs_from_ids, outputs_from_embeds) - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the + # If we pass different inputs_embeds, we should get different outputs (the output text may be the # same, but the logits will almost surely be different) random_embeds = torch.rand_like(inputs_embeds) outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - **generation_kwargs, + input_ids, inputs_embeds=random_embeds, **generation_kwargs, **inputs_dict ) for i in range(len(outputs_from_rand_embeds.scores)): 
self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same - outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, max_new_tokens=5, **generation_kwargs - ) - self.assertListEqual( - outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), - outputs_from_embeds_wo_ids.sequences.tolist(), - ) + # input_ids is not a required input on most models -- if we don't pass it, the newly generated tokens will + # be the same + if not (requires_inputs_ids or missing_attention_mask): + outputs_from_embeds_wo_ids = model.generate( + inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict + ) + outputs_from_embeds.sequences = outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :] + self._check_similar_generate_outputs(outputs_from_embeds_wo_ids, outputs_from_embeds) @pytest.mark.generate def test_generate_from_inputs_embeds_with_static_cache(self): @@ -1829,10 +1868,8 @@ def test_new_cache_format(self, num_beams, do_sample): @pytest.mark.generate def test_generate_with_static_cache(self): """ - Tests if StaticCache works if we set attn_implementation=static when generation. - This doesn't test if generation quality is good, but tests that models with - self._supports_static_cache don't throw an error when generating and return - a StaticCache object at the end. 
+ Tests that generating with static cache gives almost the same results as with dynamic cache, and the output cache + has the expected shapes """ for model_class in self.all_generative_model_classes: if not model_class._supports_static_cache: @@ -1851,13 +1888,15 @@ def test_generate_with_static_cache(self): model = model_class(config).to(torch_device).eval() generation_kwargs = { - "max_length": None, "max_new_tokens": max_new_tokens, - "cache_implementation": "static", "return_dict_in_generate": True, # Required to return `past_key_values` + "output_scores": True, "use_cache": True, } + static_cache_generation = model.generate(**generation_kwargs, **inputs_dict, cache_implementation="static") + + # Check 1: The cache shapes must match the expected shapes max_cache_len = seq_length + max_new_tokens config = config.text_config if hasattr(config, "text_config") else config head_dim = ( @@ -1869,12 +1908,14 @@ def test_generate_with_static_cache(self): else config.num_key_value_heads ) num_hidden_layers = config.num_hidden_layers - results = model.generate(**generation_kwargs, **inputs_dict) - cache_shape = (batch_size, num_key_value_heads, max_cache_len, head_dim) - self.assertTrue(isinstance(results.past_key_values, StaticCache)) - self.assertTrue(len(results.past_key_values.key_cache) == num_hidden_layers) - self.assertTrue(results.past_key_values.key_cache[0].shape == cache_shape) + self.assertTrue(isinstance(static_cache_generation.past_key_values, StaticCache)) + self.assertTrue(len(static_cache_generation.past_key_values.key_cache) == num_hidden_layers) + self.assertTrue(static_cache_generation.past_key_values.key_cache[0].shape == cache_shape) + + # Check 2: The outputs must be similar to the case with dynamic cache + dynamic_cache_generation = model.generate(**generation_kwargs, **inputs_dict) + self._check_similar_generate_outputs(dynamic_cache_generation, static_cache_generation) @require_optimum_quanto @pytest.mark.generate @@ -1908,25 +1949,32 @@ def 
test_generate_with_quant_cache(self): with self.assertRaises(ValueError): model.generate(**generation_kwargs, **inputs_dict) + @parameterized.expand( + [ + ("forward_only", False), # TODO (@joao): a few models failing. After fixed, this should not be "@slow" + ("end_to_end", True), # TODO (@joao): end-to-end compilation is broken with torch 2.5+, explore and fix + ] + ) @pytest.mark.generate @require_torch_gpu @slow - @is_flaky() # compilation may result in equivalent (!= same) FP ops, causing the argmax in `generate` to be flaky - def test_generate_compile_fullgraph(self): + def test_generate_compile(self, _, end_to_end): """ - Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. + Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. Tests + end-to-end compilation and forward pass compilation only. ⚠️ Runs two sequential generations to ensure the cache doesn't get stuck after the first compiled run! 
⚠️ """ for model_class in self.all_generative_model_classes: if not model_class._supports_static_cache: self.skipTest("This model doesn't support static cache") + # TODO (joao) -- fix and enable me :) - if any(model_name in model_class.__name__.lower() for model_name in ["whisper"]): + if end_to_end and any(model_name in model_class.__name__.lower() for model_name in ["whisper"]): self.skipTest("whisper model end-to-end generate compile not yet supported") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() # TODO (joao) -- fix and enable me :) - if config.is_encoder_decoder: + if end_to_end and config.is_encoder_decoder: self.skipTest("Encoder-decoder model end-to-end generate compile not yet supported") model = model_class(config).to(torch_device) @@ -1941,27 +1989,33 @@ def test_generate_compile_fullgraph(self): generation_kwargs = { "do_sample": False, "max_new_tokens": 10, + "return_dict_in_generate": True, + "output_scores": True, } + # end-to-end works best with dynamic cache, forward compilation works best with static cache + if not end_to_end: + generation_kwargs["cache_implementation"] = "static" - max_cache_len = input_ids.shape[1] + generation_kwargs["max_new_tokens"] - config = config.get_text_config() - past_key_values = StaticCache( - config, batch_size=half_batch_size, max_cache_len=max_cache_len, device=torch_device - ) + # get eager + dynamic cache results for future comparison + dynamic_outputs = [] + for model_inputs in input_ids_sets: + dynamic_outputs.append(model.generate(model_inputs, **generation_kwargs)) + + # get compiled results + generation_config = copy.deepcopy(model.generation_config) + generation_config.update(**generation_kwargs) + torch.compiler.reset() + if end_to_end: + model.generate = torch.compile(model.generate, fullgraph=True, mode="reduce-overhead") + else: + model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead") + compiled_outputs = [] for model_inputs in 
input_ids_sets: - # eager dynamic cache - output_dynamic = model.generate(model_inputs, **generation_kwargs) - - # end-to-end compiled dynamic cache - torch.compiler.reset() - compiled_generate = torch.compile(model.generate, fullgraph=True, mode="reduce-overhead") - generation_config = copy.deepcopy(model.generation_config) - generation_config.update(**generation_kwargs) - output_compiled = compiled_generate( - model_inputs, generation_config=generation_config, past_key_values=past_key_values - ) - self.assertListEqual(output_dynamic.tolist(), output_compiled.tolist()) + compiled_outputs.append(model.generate(model_inputs, generation_config=generation_config)) + + for dynamic_result, compiled_result in zip(dynamic_outputs, compiled_outputs): + self._check_similar_generate_outputs(dynamic_result, compiled_result) @pytest.mark.generate def test_generate_methods_with_num_logits_to_keep(self): @@ -1989,7 +2043,6 @@ def test_generate_methods_with_num_logits_to_keep(self): self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist()) @pytest.mark.generate - @is_flaky() # assisted generation tests are flaky (minor fp ops differences) def test_assisted_decoding_with_num_logits_to_keep(self): for model_class in self.all_generative_model_classes: if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()): @@ -1998,6 +2051,9 @@ def test_assisted_decoding_with_num_logits_to_keep(self): self.skipTest(reason="Stateful models don't support assisted generation") config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) + # NOTE: assisted generation only works with cache on at the moment. 
+ if not hasattr(config, "use_cache"): + self.skipTest(reason=f"{model_class.__name__} doesn't support caching") config.use_cache = True config.is_decoder = True @@ -2010,14 +2066,16 @@ def test_assisted_decoding_with_num_logits_to_keep(self): "max_new_tokens": 10, "do_sample": False, "assistant_model": assistant_model, + "return_dict_in_generate": True, + "output_scores": True, } - assistant_model.generation_config.assistant_confidence_threshold = None # Setting num_logits_to_keep at 0 keeps all logits (old behavior) with_all_logits = model.generate(**generation_kwargs, **inputs_dict, num_logits_to_keep=0) # By default, num_logits_to_keep is automatically set to 1 if not provided (new behavior) without_all_logits = model.generate(**inputs_dict, **generation_kwargs) - self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist()) + + self._check_similar_generate_outputs(with_all_logits, without_all_logits) @pytest.mark.generate def test_inherits_generation_mixin(self): @@ -2028,14 +2086,21 @@ def test_inherits_generation_mixin(self): for model_class in self.all_generative_model_classes: self.assertTrue("GenerationMixin" in str(model_class.__bases__)) - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): + def _test_attention_implementation(self, attn_implementation): + """ + Compares the output of generate with the eager attention implementation against other implementations. + NOTE: despite the test logic being the same, different implementations actually need different decorators, hence + this separate function. 
+ """ max_new_tokens = 30 + support_flag = { + "sdpa": "_supports_sdpa", + "flash_attention_2": "_supports_flash_attn_2", + } for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") + if not getattr(model_class, support_flag[attn_implementation]): + self.skipTest(f"{model_class.__name__} does not support `attn_implementation={attn_implementation}`") config, original_inputs_dict = self.prepare_config_and_inputs_for_generate() inputs_dict = {} @@ -2062,63 +2127,59 @@ def test_eager_matches_sdpa_generate(self): "do_sample": False, "return_dict_in_generate": True, "output_scores": True, + "use_cache": True, } - model_sdpa = model_class.from_pretrained( + model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True, + attn_implementation="eager", ).to(torch_device) - res_sdpa = model_sdpa.generate(**inputs_dict, **generate_kwargs) - del model_sdpa + res_eager = model_eager.generate(**inputs_dict, **generate_kwargs) + del model_eager gc.collect() - model_eager = model_class.from_pretrained( + model_attn = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True, - attn_implementation="eager", + attn_implementation=attn_implementation, ).to(torch_device) - res_eager = model_eager.generate(**inputs_dict, **generate_kwargs) - del model_eager + res_attn = model_attn.generate(**inputs_dict, **generate_kwargs) + del model_attn gc.collect() - # Eager and SDPA are very similar, but not exactly the same. Because we are using random models, this - # test would be flaky if we only checked the sequences. Two situations in which this test passes: - # 1. The sequences are the same - # 2. 
The sequences are different, but the scores up until the first mismatch are nearly identical - output_matches = res_eager.sequences == res_sdpa.sequences - has_matching_outputs = output_matches.all() - has_matching_scores = None - if not has_matching_outputs: - input_length = main_input.shape[1] - for batch_idx in range(res_eager.sequences.shape[0]): - batch_matches = output_matches[batch_idx] - if batch_matches.all(): - continue - first_mismatch_idx = batch_matches.int().argmin() # gets the index of the first False - first_mismatch_idx -= input_length # scores doesn't include data regarding input tokens - sdpa_first_mismatch_scores = res_sdpa.scores[first_mismatch_idx][batch_idx] - eager_first_mismatch_scores = res_eager.scores[first_mismatch_idx][batch_idx] - has_matching_scores = torch.allclose( - sdpa_first_mismatch_scores, eager_first_mismatch_scores, rtol=1e-3, atol=1e-3 - ) - if not has_matching_scores: - break + self._check_similar_generate_outputs(res_eager, res_attn, atol=1e-3, rtol=1e-3) - self.assertTrue(has_matching_outputs or has_matching_scores) + @pytest.mark.generate + @require_torch_sdpa + @slow + def test_eager_matches_sdpa_generate(self): + """Tests that generate has equivalent outputs with SDPA and eager attention implementations.""" + self._test_attention_implementation("sdpa") - def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): - # we can be sure what is batch size from main input but seq length depends on model type and whether input is text/audio/image - # so we infer actual text seq length from model_tester, same was as it is done in `test_modeling_common.py` tests` - batch_size = main_input.shape[0] + @pytest.mark.flash_attn_test + @require_flash_attn + @require_torch_gpu + @slow + def test_eager_matches_fa2_generate(self): + """Tests that generate has equivalent outputs with FA2 and eager attention implementations.""" + # TODO (@joao @raushan) -- this test is failing the output checks on most 
models, investigate. After fixing, + # check whether we still need the overwrites + self._test_attention_implementation("flash_attention_2") + + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + ) seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) seq_length = getattr(self.model_tester, "text_seq_length", seq_length) config = config.text_config if hasattr(config, "text_config") else config - num_sequences_in_output = batch_size * num_return_sequences gen_len = ( output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length @@ -2129,19 +2190,21 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) # unprocessed logits - self._check_logits(num_sequences_in_output, output.logits, config=config) + self._check_logits(internal_batch_size, output.logits, config=config) # Attentions if self.has_attentions: if config.is_encoder_decoder: # encoder - self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + self._check_encoder_attention_for_generate( + output.encoder_attentions, input_batch_size, config, seq_length + ) # decoder self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_attentions, min_length=1, max_length=output.sequences.shape[-1], @@ -2153,7 +2216,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, 
num_return attentions = output.attentions if not use_cache else output.attentions[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, attentions=attentions, min_length=min_length, max_length=output.sequences.shape[-1], @@ -2165,12 +2228,12 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return if config.is_encoder_decoder: # encoder self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, seq_length + output.encoder_hidden_states, input_batch_size, config, seq_length ) # decoder self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_hidden_states, min_length=1, max_length=output.sequences.shape[-1], @@ -2182,7 +2245,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, hidden_states, min_length=min_length, max_length=output.sequences.shape[-1], @@ -2213,7 +2276,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return past_key_values = output.past_key_values past_sequence_length = output.sequences.shape[-1] - 1 self._check_past_key_values_for_generate( - num_sequences_in_output, + internal_batch_size, past_key_values, seq_length=past_sequence_length, config=config, diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py index eda51d21199..e4d0df141be 100644 --- a/tests/models/bart/test_modeling_bart.py +++ b/tests/models/bart/test_modeling_bart.py @@ -1532,8 +1532,3 @@ def test_retain_grad_hidden_states_attentions(self): @unittest.skip def test_save_load_fast_init_from_base(self): pass - - 
@unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for bartforcausalLM - pass diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index aa9835d8cd6..25566027742 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -511,11 +511,6 @@ def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for bertforcausalLM - pass - def test_model_as_decoder_with_default_input_mask(self): # This regression test was failing with PyTorch < 1.3 ( diff --git a/tests/models/chameleon/test_modeling_chameleon.py b/tests/models/chameleon/test_modeling_chameleon.py index aad26ef147e..2a8e7633ba4 100644 --- a/tests/models/chameleon/test_modeling_chameleon.py +++ b/tests/models/chameleon/test_modeling_chameleon.py @@ -16,17 +16,14 @@ import unittest -import pytest import requests from parameterized import parameterized from transformers import ChameleonConfig, is_torch_available, is_vision_available, set_seed from transformers.testing_utils import ( require_bitsandbytes, - require_flash_attn, require_read_token, require_torch, - require_torch_gpu, slow, torch_device, ) @@ -329,43 +326,6 @@ def test_model_rope_scaling(self, scaling_type): # The output should be different for long inputs self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - @require_flash_attn - @require_read_token - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ 
- model = ChameleonForConditionalGeneration.from_pretrained( - "facebook/chameleon-7b", - load_in_4bit=True, - device_map={"": 0}, - ) - - processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") - texts = ["hi", "Hello this is a very long sentence"] - - processor.tokenizer.padding_side = "right" - - inputs = processor(text=texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = processor.tokenizer.batch_decode(output_native) - - model = ChameleonForConditionalGeneration.from_pretrained( - "facebook/chameleon-7b", - load_in_4bit=True, - attn_implementation="flash_attention_2", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = processor.tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @unittest.skip("Chameleon forces some token ids to be -inf!") def test_batching_equivalence(self): pass diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index a888bdcd3bc..e8483f8c7c7 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -319,9 +319,6 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.6] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google/gemma-2b" - # used in `test_torch_compile_for_training` _torch_compile_train_cls = GemmaForCausalLM if is_torch_available() else None @@ -419,51 +416,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict 
= self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Gemma apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/gemma2/test_modeling_gemma2.py b/tests/models/gemma2/test_modeling_gemma2.py index 94670803daa..7bca83f96d7 100644 --- a/tests/models/gemma2/test_modeling_gemma2.py +++ b/tests/models/gemma2/test_modeling_gemma2.py @@ -78,7 +78,6 @@ class Gemma2ModelTest(GemmaModelTest, unittest.TestCase): test_pruning = False _is_stateful = True model_split_percents = [0.5, 0.6] - _torch_compile_test_ckpt = "google/gemma-2-9b" def setUp(self): self.model_tester = Gemma2ModelTester(self) diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py index 32bce7cbfa6..b92c5db815b 100644 --- a/tests/models/glm/test_modeling_glm.py +++ b/tests/models/glm/test_modeling_glm.py @@ -28,7 +28,6 @@ require_flash_attn, require_torch, require_torch_accelerator, - 
require_torch_gpu, require_torch_sdpa, slow, torch_device, @@ -306,10 +305,6 @@ class GlmModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, test_headmasking = False test_pruning = False - # used in `test_torch_compile` - _torch_compile_test_ckpt = "THUDM/glm-4-9b" - _torch_compile_test_revision = "refs/pr/15" - def setUp(self): self.model_tester = GlmModelTester(self) self.config_tester = ConfigTester(self, config_class=GlmConfig, hidden_size=37) @@ -426,41 +421,6 @@ def test_custom_4d_attention_mask(self): torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-3) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """Overwrite the common test as the test is flaky on tiny models.""" - model = GlmForCausalLM.from_pretrained( - "THUDM/glm-4-9b", - device_map={"": 0}, - torch_dtype=torch.bfloat16, - revision="refs/pr/15", - ) - - tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b", revision="refs/pr/15") - tokenizer.padding_side = "right" - - texts = ["hi", "Hello this is a very long sentence"] - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=15, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GlmForCausalLM.from_pretrained( - "THUDM/glm-4-9b", - device_map={"": 0}, - attn_implementation="flash_attention_2", - torch_dtype=torch.bfloat16, - revision="refs/pr/15", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=15, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py index 6f6fba50dc1..afc741cd502 100644 --- a/tests/models/gptj/test_modeling_gptj.py +++ 
b/tests/models/gptj/test_modeling_gptj.py @@ -17,14 +17,9 @@ import datetime import unittest -import pytest - -from transformers import BitsAndBytesConfig, GPTJConfig, is_torch_available +from transformers import GPTJConfig, is_torch_available from transformers.testing_utils import ( - require_bitsandbytes, - require_flash_attn, require_torch, - require_torch_gpu, slow, tooslow, torch_device, @@ -505,44 +500,6 @@ def test_model_from_pretrained(self): model = GPTJModel.from_pretrained(model_name, revision="float16", torch_dtype=torch.float16) self.assertIsNotNone(model) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b") - - texts = ["hi", "Hello this is a very long sentence"] - expected_outputs = [ - "hi<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Q: I have a question about the new version of the game. 
I have a question about the", - "Hello this is a very long sentence.\n\nA:\n\nI think the best way to understand this is to think of it", - ] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - quantization_config = BitsAndBytesConfig(load_in_4bit=True) - - model = GPTJForCausalLM.from_pretrained( - "EleutherAI/gpt-j-6b", - device_map={"": 0}, - attn_implementation="flash_attention_2", - revision="float16", - torch_dtype=torch.float16, - quantization_config=quantization_config, - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(expected_outputs, output_fa_2) - @require_torch class GPTJModelLanguageGenerationTest(unittest.TestCase): diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index 1bcb6641803..97b59f5aa50 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -17,12 +17,10 @@ import tempfile import unittest -import pytest from parameterized import parameterized -from transformers import AutoTokenizer, GraniteConfig, is_torch_available, set_seed +from transformers import GraniteConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_bitsandbytes, require_flash_attn, require_read_token, require_torch, @@ -303,9 +301,6 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "ibm/PowerLM-3b" - def setUp(self): self.model_tester = GraniteModelTester(self) self.config_tester = ConfigTester(self, config_class=GraniteConfig, hidden_size=37) @@ -423,46 +418,6 @@ def test_model_rope_scaling(self): with 
self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @require_read_token - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = GraniteForCausalLM.from_pretrained( - "ibm/PowerLM-3b", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = AutoTokenizer.from_pretrained("ibm/PowerLM-3b") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GraniteForCausalLM.from_pretrained( - "ibm/PowerLM-3b", - load_in_4bit=True, - device_map={"": 0}, - attn_implementation="flash_attention_2", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @require_flash_attn @require_torch_gpu @slow diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index 124ce0c3bb5..f2f76b9fa75 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -17,12 +17,10 @@ import tempfile import unittest -import pytest from parameterized import parameterized from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_bitsandbytes, require_flash_attn, require_read_token, require_torch, @@ -302,9 +300,6 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test # This is because we are hitting edge 
cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "ibm/PowerMoE-3b" - def setUp(self): self.model_tester = GraniteMoeModelTester(self) self.config_tester = ConfigTester(self, config_class=GraniteMoeConfig, hidden_size=37) @@ -422,46 +417,6 @@ def test_model_rope_scaling(self): with self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @require_read_token - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = GraniteMoeForCausalLM.from_pretrained( - "ibm-granite/granitemoe-3b", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granitemoe-3b") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GraniteMoeForCausalLM.from_pretrained( - "ibm-granite/granitemoe-3b", - load_in_4bit=True, - device_map={"": 0}, - attn_implementation="flash_attention_2", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @require_flash_attn @require_torch_gpu @slow diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index c2f0ef8ccd0..d19d10932bf 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -770,13 +770,6 @@ def test_contrastive_generate_low_memory(self): def 
test_custom_4d_attention_mask(self): pass - @unittest.skip( - reason="IDEFICS has specific requirements for working with inputs embeds like passing also the ids and pixels" - ) - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): - pass - @unittest.skip(reason="IDEFICS cannot compile due to dynamic control flow when checking inputs") def test_generate_compile_fullgraph(self): pass diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 854b8b93457..042fecf4bd2 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -20,7 +20,6 @@ import unittest from io import BytesIO -import pytest import requests from transformers import ( @@ -420,50 +419,6 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): def test_flash_attn_2_fp32_ln(self): pass - @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - input_ids = inputs_dict.pop("input_ids") - - # Traditional way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True - ) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - 
outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) - - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the - # same, but the logits will almost surely be different) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - # We need to override as we need to prepare such that the image token is the last token def test_resize_tokens_embeddings(self): (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index f0366e7b539..5dc352d22fe 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -19,7 +19,6 @@ import unittest from io import BytesIO -import pytest import requests from transformers import ( @@ -180,10 +179,6 @@ def test_inputs_embeds(): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Model does not support padding right") - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -337,10 +332,6 @@ def setUp(self): def test_inputs_embeds(): pass - @unittest.skip(reason="Model does not support padding right") - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip(reason="Model does not support padding right") def 
test_flash_attn_2_inference_padding_right(self): pass @@ -367,50 +358,6 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): def test_flash_attn_2_fp32_ln(self): pass - @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - input_ids = inputs_dict.pop("input_ids") - - # Traditional way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True - ) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) - - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the - # same, but the logits will almost surely be different) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], 
outputs_from_rand_embeds.scores[i])) - # We need to override as we need to prepare such that the image token is the last token def test_resize_tokens_embeddings(self): (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() @@ -526,31 +473,6 @@ def test_resize_embeddings_untied(self): # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) - def test_inputs_embeds_matches_input_ids_with_generate(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - model.to(torch_device) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - wte = model.get_input_embeddings() - - input_ids = inputs["input_ids"] - # some models infer position ids/attn mask differently when input ids - # by check if pad_token let's make sure no padding is in input ids - not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - input_ids[input_ids == pad_token_id] = not_pad_token_id - del inputs["input_ids"] - inputs_embeds = wte(input_ids) - out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2) - out_embeds = model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) - - self.assertTrue(torch.allclose(out_embeds, out_ids)) - @require_torch class Idefics3ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py index 251f293f722..ef0b5831587 100644 --- a/tests/models/jamba/test_modeling_jamba.py +++ b/tests/models/jamba/test_modeling_jamba.py @@ -539,93 +539,6 @@ def 
test_flash_attn_2_fp32_ln(self): # with attention mask _ = model(dummy_input, attention_mask=dummy_attention_mask) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - r""" - Overriding the test_flash_attn_2_generate_padding_right test as the Jamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - r""" - Overriding the test_flash_attn_2_generate_use_cache test as the Jamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - 
dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Jamba does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/jetmoe/test_modeling_jetmoe.py b/tests/models/jetmoe/test_modeling_jetmoe.py index a04d8bba741..dc510f0ff04 100644 --- a/tests/models/jetmoe/test_modeling_jetmoe.py +++ b/tests/models/jetmoe/test_modeling_jetmoe.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch JetMoe model.""" import gc -import tempfile import unittest import pytest @@ -377,85 +376,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device 
- ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: JetMoe apparently does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index de6c0b15d66..0f0b595d3d2 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -438,12 +438,6 @@ def check_same_values(layer_1, layer_2): # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) - @unittest.skip( - "KOSMOS-2 doesn't support inputs embeds. 
The test isn't skipped by checking ipnut args because KOSMOS-2 has `generate()` overwritten" - ) - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @slow def test_model_from_pretrained(self): model_name = "microsoft/kosmos-2-patch14-224" diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 824337d8bdd..375ec1dd3e6 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -26,7 +26,6 @@ from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( backend_empty_cache, - require_bitsandbytes, require_flash_attn, require_read_token, require_torch, @@ -316,9 +315,6 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "meta-llama/Llama-2-7b-hf" - # used in `test_torch_compile_for_training` _torch_compile_train_cls = LlamaForCausalLM if is_torch_available() else None @@ -585,43 +581,6 @@ def _reinitialize_config(base_config, new_kwargs): with self.assertRaises(KeyError): config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor" - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @require_read_token - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, 
return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", load_in_4bit=True, device_map={"": 0}, attn_implementation="flash_attention_2" - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @require_flash_attn @require_torch_gpu @slow diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index 1a8cf047745..9b3a9563b58 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -204,8 +204,8 @@ def test_generate_without_input_ids(self): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") @@ -276,12 +276,6 @@ def recursive_check(tuple_object, dict_object): dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - @unittest.skip( - reason="Mamba2 does not support generating with input embeddings (custom cache_position computation)" - ) - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @require_torch @slow diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index 074dceae155..df0007d666a 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -21,7 +21,6 @@ import numpy as np from 
datasets import Audio, load_dataset -from packaging import version from parameterized import parameterized from pytest import mark @@ -745,22 +744,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): def test_sdpa_can_compile_dynamic(self): pass - # For now, Let's focus only on GPU for `torch.compile` - @slow - @require_torch_gpu - def test_torch_compile(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - n_iter = 3 - for model_class in self.all_model_classes: - model = model_class(config).to(torch_device) - model.forward = torch.compile(model.forward) - for i in range(n_iter): - _ = model(inputs_dict["input_values"].to(torch_device)) - @is_flaky() def test_batching_equivalence(self): super().test_batching_equivalence() diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index f2ee714bcdb..1538735ad78 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Mistral model.""" import gc -import tempfile import unittest import pytest @@ -416,85 +415,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 
4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Mistral apparently does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index b9b5faed851..931bb1f17be 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -14,7 +14,6 @@ # limitations under the License. """Testing suite for the PyTorch Mixtral model.""" -import tempfile import unittest import pytest @@ -415,85 +414,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with 
self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Mixtral apparently does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 3efa7b778fb..5174247b895 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -126,7 +126,6 @@ class MllamaForCausalLMModelTest(ModelTesterMixin, GenerationTesterMixin, unitte all_generative_model_classes = (MllamaForCausalLM,) if is_torch_available() else () test_pruning = False test_head_masking = False - _torch_compile_test_ckpt = "nltpt/Llama-3.2-11B-Vision" def setUp(self): self.model_tester = MllamaText2TextModelTester(self) diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index b77a6ff1036..7d4b855c10d 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -560,7 +560,7 @@ def _get_input_ids_and_config(self, batch_size=2): return config, input_ids, attention_mask, inputs_dict def prepare_config_and_inputs_for_generate(self, batch_size=2): - config, filtered_inputs_dict = super().prepare_config_and_inputs_for_generate() + config, filtered_inputs_dict = super().prepare_config_and_inputs_for_generate(batch_size=batch_size) # Make sure we only return `input_ids`. # Note that audio_codes will still be generated internally, so the ability to test audio codes is still there. 
@@ -591,9 +591,11 @@ def _check_hidden_states_for_generate( [expected_shape] * len(iter_hidden_states), ) - def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): # Overwrite because the generate method actually alway uses `inputs_embeds` so `use_cache` is always `True` - super()._check_outputs(output, input_ids, config, use_cache=True, num_return_sequences=num_return_sequences) + super()._check_outputs( + output, config, use_cache=True, num_return_sequences=num_return_sequences, num_beams=num_beams + ) def _check_hidden_states_for_generate( self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 @@ -655,59 +657,6 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - @pytest.mark.generate - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): - for model_class in self.all_generative_model_classes: - config, input_ids, _, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).to(torch_device).eval() - generation_kwargs = { - "return_dict_in_generate": True, - "output_scores": True, - "num_beams": num_beams, - "do_sample": False, - } - - # Traditional way of generating text - outputs_from_ids = model.generate(input_ids, max_new_tokens=5, **generation_kwargs, **inputs_dict) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - **generation_kwargs, - **inputs_dict, - ) - - # But if we pass different inputs_embeds, we should get different 
outputs (the output text may be the - # same, but the logits will almost surely be different) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - **generation_kwargs, - **inputs_dict, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - - # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same - outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, - max_new_tokens=5, - **generation_kwargs, - **inputs_dict, - ) - self.assertListEqual( - outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), - outputs_from_embeds_wo_ids.sequences.tolist(), - ) - @unittest.skip(reason="Continuing from past key values is not straightforward as we're dealing with 3 inputs") def test_generate_continue_from_past_key_values(self): pass diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py index 20412da2e1d..1628d3a5893 100644 --- a/tests/models/mt5/test_modeling_mt5.py +++ b/tests/models/mt5/test_modeling_mt5.py @@ -576,9 +576,6 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # The small MT5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google/mt5-small" - def setUp(self): self.model_tester = MT5ModelTester(self) self.config_tester = ConfigTester(self, config_class=MT5Config, d_model=37) diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 346ad60debe..963cace28d6 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -450,144 +450,6 @@ def 
test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - 
if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough 
positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow @@ -1585,149 +1447,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is 
None: - dummy_attention_mask = torch.ones_like(dummy_input) - - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = 
model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_torch_sdpa def test_sdpa_can_dispatch_composite_models(self): if 
not self.has_attentions: diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index f3b6be0ac65..957db9f23b0 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -1437,149 +1437,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - 
).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from 
tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_torch_sdpa def test_sdpa_can_dispatch_composite_models(self): if not self.has_attentions: diff --git a/tests/models/nemotron/test_modeling_nemotron.py b/tests/models/nemotron/test_modeling_nemotron.py index 13adfe1e579..37a581a3386 100644 --- a/tests/models/nemotron/test_modeling_nemotron.py +++ b/tests/models/nemotron/test_modeling_nemotron.py @@ -92,8 +92,6 @@ class NemotronModelTest(GemmaModelTest): test_pruning = False fx_compatible = False - # used in `test_torch_compile` - _torch_compile_test_ckpt = "nvidia/nemotron-3-8b-base-4k-hf" # 
used in `test_torch_compile_for_training` _torch_compile_train_cls = NemotronForCausalLM if is_torch_available() else None diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index 95ae59dfc08..1d96b9c338f 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -346,10 +346,6 @@ def test_save_load_low_cpu_mem_usage_no_safetensors(self): def test_generate_from_inputs_embeds_with_static_cache(self): pass - @unittest.skip(reason="TODO (@joao): fix me -- failing to produce similar results") - def test_static_cache_matches_dynamic(self): - pass - @unittest.skip("FlashAttention only support fp16 and bf16 data type") def test_flash_attn_2_fp32_ln(self): pass diff --git a/tests/models/phi/test_modeling_phi.py b/tests/models/phi/test_modeling_phi.py index c17f69a4998..eae6789bef2 100644 --- a/tests/models/phi/test_modeling_phi.py +++ b/tests/models/phi/test_modeling_phi.py @@ -17,15 +17,11 @@ import unittest -import pytest from parameterized import parameterized from transformers import PhiConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_bitsandbytes, - require_flash_attn, require_torch, - require_torch_gpu, slow, torch_device, ) @@ -468,43 +464,6 @@ def test_model_rope_scaling(self): torch.testing.assert_close(ntk_sin_long, original_sin_long) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_flash_attn_2_generate_padding_right with LlamaForCausalLM->PhiForCausalLM,LlamaTokenizer->AutoTokenizer,meta-llama/Llama-2-7b-hf->microsoft/phi-1 - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = 
PhiForCausalLM.from_pretrained( - "microsoft/phi-1", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = PhiForCausalLM.from_pretrained( - "microsoft/phi-1", load_in_4bit=True, device_map={"": 0}, attn_implementation="flash_attention_2" - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @slow @require_torch diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 4e57f8e0f00..f51dc2e0a5e 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Qwen2 model.""" import gc -import tempfile import unittest import pytest @@ -428,85 +427,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], 
[1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Qwen2 apparently does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index c545e882fae..abc7b57919b 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Qwen2MoE model.""" import gc -import tempfile import unittest import pytest @@ -453,85 +452,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - 
_ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Qwen2Moe apparently does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index e1cd715f8f1..a3272853a78 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -301,10 +301,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @unittest.skip(reason="CPU offload is not yet supported") def test_cpu_offload(self): pass diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index 542955f9fa4..985115d7707 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -420,10 +420,6 @@ def _check_hidden_states_for_generate( def test_initialization(self): pass - @unittest.skip(reason="RecurrentGemma does not support generating with input embeddings (missing position_ids)") - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @require_torch_accelerator @slow diff --git a/tests/models/starcoder2/test_modeling_starcoder2.py b/tests/models/starcoder2/test_modeling_starcoder2.py index 32d28143d72..df743f132c1 100644 --- a/tests/models/starcoder2/test_modeling_starcoder2.py +++ b/tests/models/starcoder2/test_modeling_starcoder2.py @@ -14,7 
+14,6 @@ # limitations under the License. """Testing suite for the PyTorch Starcoder2 model.""" -import tempfile import unittest import pytest @@ -404,85 +403,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - 
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Starcoder2 apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index 68dd5a52b3d..b0341639076 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -580,9 +580,6 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # The small T5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google-t5/t5-small" - def setUp(self): self.model_tester = T5ModelTester(self) self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py index ec4c1d019b6..377668851c5 100644 --- a/tests/models/umt5/test_modeling_umt5.py +++ b/tests/models/umt5/test_modeling_umt5.py @@ -317,9 +317,6 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin # The small UMT5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google/umt5-small" - def 
setUp(self): self.model_tester = UMT5ModelTester(self) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index b24c577a16e..12aedaca8cf 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1574,59 +1574,6 @@ def test_generate_output_type(self, return_dict_in_generate): ) assert isinstance(pred_ids, expected_output_type) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_reuse_cache(self): - max_new_tokens = 2 - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name][..., :10] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # run generate once to get filled cache - output = model.generate( - dummy_input, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - return_dict_in_generate=True, - ) - past_key_values = output.past_key_values - - # Try to continue generation from where we left, given that we have more than 1 new token to process - # e.g. 
this can happen in speculative decoding when feeding candidate tokens back to target model - _ = model.generate( - dummy_input, - decoder_input_ids=output.sequences, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - past_key_values=past_key_values, - ) - def test_labels_sequence_max_length_correct(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -3961,11 +3908,6 @@ def test_generate_without_input_ids(self): # generate only works with input ids for whisper pass - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for whisper - pass - @unittest.skip(reason="Decoder can't keep attention grads") def test_retain_grad_hidden_states_attentions(self): return @@ -3974,18 +3916,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_save_load_fast_init_from_base(self): pass - @unittest.skip( - reason="FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" - ) - def test_flash_attn_2_generate_reuse_cache(self): - pass - - @unittest.skip( - "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" - ) - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip( "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" ) diff --git a/tests/models/zamba/test_modeling_zamba.py b/tests/models/zamba/test_modeling_zamba.py index c0a8020bedd..a6dd516f98a 100644 --- a/tests/models/zamba/test_modeling_zamba.py +++ b/tests/models/zamba/test_modeling_zamba.py @@ -542,93 +542,6 @@ def test_flash_attn_2_fp32_ln(self): # with attention mask _ = model(dummy_input, attention_mask=dummy_attention_mask) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def 
test_flash_attn_2_generate_padding_right(self): - r""" - Overriding the test_flash_attn_2_generate_padding_right test as the Zamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - r""" - Overriding the test_flash_attn_2_generate_use_cache test as the Zamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = 
max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Zamba does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index d88b0dc5f02..e2719d8cf1b 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -22,7 +22,6 @@ import random import re import tempfile -import time import warnings from collections import defaultdict from contextlib import contextmanager @@ -37,10 +36,7 @@ from transformers import ( AutoModel, AutoModelForCausalLM, - AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, - AutoTokenizer, - GenerationConfig, PretrainedConfig, PreTrainedModel, is_torch_available, @@ -86,7 +82,6 @@ require_deepspeed, require_flash_attn, require_non_xpu, - require_read_token, require_safetensors, require_torch, require_torch_accelerator, @@ -3000,71 +2995,6 @@ def test_inputs_embeds_matches_input_ids(self): )[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) - def test_inputs_embeds_matches_input_ids_with_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_generative_model_classes: - if model_class.__name__ not in [ - *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), - 
*get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), - ]: - continue - - model = model_class(config) - model.to(torch_device) - model.eval() - - model_forward_args = inspect.signature(model.forward).parameters - if any(argument not in model_forward_args for argument in ["inputs_embeds", "position_ids"]): - self.skipTest(reason="This model doesn't use `inputs_embeds` or `position_ids`.") - has_inputs_embeds_forwarding = "inputs_embeds" in set( - inspect.signature(model.prepare_inputs_for_generation).parameters.keys() - ) - if not has_inputs_embeds_forwarding: - self.skipTest(reason="This model doesn't support `inputs_embeds` passed to `generate`.") - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - # VLMs can't generate with embeds and pixels at the same time. We expect the user to pass merged - # embeds already - if model_class.__name__ in get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES): - inputs.pop("pixel_values", None) - inputs.pop("pixel_values_videos", None) - inputs.pop("pixel_values_images", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - # some models infer position ids/attn mask differently when input ids - # by check if pad_token let's make sure no padding is in input ids - not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - input_ids[input_ids == pad_token_id] = not_pad_token_id - del inputs["input_ids"] - inputs_embeds = wte(input_ids) - out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2)[:, -2:] - out_embeds = model.generate(inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - decoder_input_ids[decoder_input_ids 
== pad_token_id] = max(0, pad_token_id + 1) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - inputs_embeds = wte(encoder_input_ids) - decoder_inputs_embeds = wte(decoder_input_ids) - out_ids = model.generate( - input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs, max_new_tokens=2 - )[:, -2:] - out_embeds = model.generate( - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - **inputs, - max_new_tokens=2, - ) - # NOTE: this test changes the order of FP ops, there may be tiny differences in the output - number_of_different_tokens = (out_ids != out_embeds).sum() - max_differences = int(out_ids.shape[0] * out_ids.shape[1] * 0.1) - self.assertTrue(number_of_different_tokens <= max_differences) # accept up to 10% mismatch - @require_non_xpu @require_torch_multi_gpu def test_multi_gpu_data_parallel_forward(self): @@ -3857,102 +3787,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - @is_flaky() - def test_flash_attn_2_generate_left_padding(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = 
inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @is_flaky() - @slow - def test_flash_attn_2_generate_padding_right(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - 
low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - def test_attn_implementation_composite_models(self): """ Tests if composite models can receive a dict object as attn_implementation, where each key should be @@ -4525,65 +4359,6 @@ def test_sdpa_matches_eager_sliding_window(self): torch.allclose(res_eager[attention_mask == 1], res_sdpa[attention_mask == 1], rtol=1e-4, atol=1e-4) ) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - 
use_cache=True, - ) - - # Generate with one batch only to test generation when attention mask will be None - # when real inputs are used, because there is no padding. See issue #32237 for more - dummy_input = dummy_input[:1, ...] - dummy_attention_mask = torch.ones_like(dummy_attention_mask[:1, ...]) - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @mark.flash_attn_test @@ -4640,62 +4415,6 @@ def test_flash_attn_2_can_dispatch_composite_models(self): if not has_fa2: raise ValueError("The FA2 model should have FA2 layers") - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - def test_flash_attn_2_generate_reuse_cache(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - max_new_tokens = 2 - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # run generate once to get filled cache - output = model.generate( - dummy_input, - max_new_tokens=max_new_tokens, - do_sample=False, - 
use_cache=True, - return_dict_in_generate=True, - ) - past_key_values = output.past_key_values - - # Try to continue generation from where we left, given that we have more than 1 new token to process - # e.g. this can happen in speculative decoding when feeding candidate tokens back to target model - dummy_input_updated = torch.cat([dummy_input, output.sequences], dim=-1) - _ = model.generate( - dummy_input_updated, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - past_key_values=past_key_values, - ) - @require_flash_attn @require_torch_gpu @require_bitsandbytes @@ -4999,82 +4718,6 @@ def test_custom_4d_attention_mask(self): normalized_1 = F.softmax(out_shared_prefix_last_tokens) torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) - def test_static_cache_matches_dynamic(self): - """ - Tests that generating with static cache give almost same results as with dynamic cache. - This test does not compile the model and check only logits similarity for numerical precision - errors. 
- """ - if len(self.all_generative_model_classes) == 0: - self.skipTest( - reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks" - ) - for model_class in self.all_generative_model_classes: - if not model_class._supports_static_cache: - self.skipTest(f"{model_class.__name__} does not support static cache") - - if not model_class._supports_cache_class: - self.skipTest(f"{model_class.__name__} does not support cache class") - - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - if getattr(config, "sliding_window", 0) is not None and getattr(config, "sliding_window", 0) > 0: - self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test") - - model = model_class(config).to(device=torch_device, dtype=torch.float32) - model.eval() - - dynamic_out = model.generate( - **inputs, do_sample=False, max_new_tokens=10, output_logits=True, return_dict_in_generate=True - ) - static_out = model.generate( - **inputs, - do_sample=False, - max_new_tokens=10, - cache_implementation="static", - output_logits=True, - return_dict_in_generate=True, - ) - self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4)) - - # For now, Let's focus only on GPU for `torch.compile` - @slow - @require_torch_accelerator - @require_read_token - def test_torch_compile(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - torch.compiler.reset() - if not hasattr(self, "_torch_compile_test_ckpt"): - self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - ckpt = self._torch_compile_test_ckpt - revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision - - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - batch_size = 1 - n_iter = 3 - - tokenizer = AutoTokenizer.from_pretrained(ckpt) - 
if self.is_encoder_decoder: - model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - else: - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - - model.generation_config.max_new_tokens = 4 - - model.generation_config.cache_implementation = "static" - model.forward = torch.compile(model.forward, mode="reduce-overhead") - - input_text = "Why dogs are cute?" - input_ids = tokenizer([input_text] * batch_size, return_tensors="pt").to(torch_device) - - for i in range(n_iter): - _ = model.generate(**input_ids, do_sample=False) - @slow @require_torch_gpu def test_torch_compile_for_training(self): @@ -5118,74 +4761,6 @@ def test_torch_compile_for_training(self): for name, param in model._orig_mod.named_parameters(): torch.testing.assert_close(param.grad.detach().cpu(), params[name], rtol=1e-4, atol=1e-4) - @slow - @require_torch_gpu # Testing cuda graphs. - @require_read_token - def test_compile_cuda_graph_time(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - - # TODO felix: All models supporting `StaticCache` or `torch.compile` should be tested. - # At the moment, only llama, gemma and gemma2 are tested here! 
- if not hasattr(self, "_torch_compile_test_ckpt"): - self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - ckpt = self._torch_compile_test_ckpt - revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision - - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - tokenizer = AutoTokenizer.from_pretrained(ckpt) - if self.is_encoder_decoder: - model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - else: - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - - cache_implementation = "static" - if model.config.model_type == "gemma2": - cache_implementation = "hybrid" - - new_tokens = 50 - gen_config = GenerationConfig( - max_new_tokens=new_tokens, - min_new_tokens=new_tokens, - use_cache=True, - pad_token_id=tokenizer.pad_token_id, - num_beams=1, - do_sample=False, - eos_token_id=None, # This is required for min_new_tokens to actually have an effect. - ) - model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. 
- - model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) - - inp = tokenizer("Why cats are cute?", return_tensors="pt").to(torch_device) - - # First run: the first run warms up each graph, which does things like CuBlas or Triton benchmarking - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - graph_warmup_time = end - start - - # Second run: CUDA Graph recording, and replays it - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - record_time = end - start - - # Finally: we hit the optimized, CUDA Graph replay path - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - opt_time = end - start - - # For the recording step, we expect only two cuda graphs and this step should be much faster than the first. 
- self.assertTrue(record_time < 0.15 * graph_warmup_time) - self.assertTrue(opt_time < record_time) - def test_forward_with_num_logits_to_keep(self): for model_class in self.all_generative_model_classes: if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()): From 241d79026f1030124dbb957de936b3d617b621f2 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo <39954772+molbap@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:17:20 +0100 Subject: [PATCH 72/99] fix pixtral processor (#34486) * fix pixtral processor * test out full length batches + remove undue ValueError * fix up processing * fix tests * fix * last fixup * style * [run-slow] pixtral * [run-slow] pixtral * fix config key * skip torchscript tests * [run-slow] pixtral * add missing key * [run-slow] pixtral * fix docs * [run-slow] pixtral * fix wrong url for integration test * [run-slow] pixtral * pixtralVisionModel does not have a lm head * [run-slow] pixtral --- .../models/pixtral/configuration_pixtral.py | 4 ++ .../models/pixtral/modeling_pixtral.py | 2 +- .../models/pixtral/processing_pixtral.py | 15 +++---- tests/models/pixtral/test_modeling_pixtral.py | 41 +------------------ .../models/pixtral/test_processor_pixtral.py | 21 +++++++++- 5 files changed, 35 insertions(+), 48 deletions(-) diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 32325a92941..14db51b947e 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -52,6 +52,8 @@ class PixtralVisionConfig(PretrainedConfig): Dropout probability for the attention layers. rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
Example: @@ -82,6 +84,7 @@ def __init__( hidden_act="gelu", attention_dropout=0.0, rope_theta=10000.0, + initializer_range=0.02, **kwargs, ): super().__init__(**kwargs) @@ -97,3 +100,4 @@ def __init__( self.hidden_act = hidden_act self.rope_theta = rope_theta self.head_dim = hidden_size // num_attention_heads + self.initializer_range = initializer_range diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 06b9701a756..b65fbd634ba 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -407,7 +407,7 @@ def _init_weights(self, module): std = ( self.config.initializer_range if hasattr(self.config, "initializer_range") - else self.config.text_config.initializer_range + else self.config.initializer_range ) if isinstance(module, (nn.Linear, nn.Conv2d)): diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index 70d28fb7b79..5913e8688d0 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -206,14 +206,15 @@ def __call__( if is_image_or_image_url(images): images = [[images]] elif isinstance(images, list) and is_image_or_image_url(images[0]): - images = [images] - elif ( - not isinstance(images, list) - and not isinstance(images[0], list) - and not is_image_or_image_url(images[0][0]) - ): + if isinstance(text, list): + images = [[im] for im in images] + else: + images = [images] + elif isinstance(images, list) and isinstance(images[0], list) and is_image_or_image_url(images[0][0]): + pass + else: raise ValueError( - "Invalid input images. Please provide a single image or a list of images or a list of list of images." + "Invalid input images. Please provide a single image, a list of images, or a list of lists of images." 
) images = [[load_image(im) for im in sample] for sample in images] image_inputs = self.image_processor(images, patch_size=self.patch_size, **output_kwargs["images_kwargs"]) diff --git a/tests/models/pixtral/test_modeling_pixtral.py b/tests/models/pixtral/test_modeling_pixtral.py index 9a128f6ad28..0c36cb5a4e0 100644 --- a/tests/models/pixtral/test_modeling_pixtral.py +++ b/tests/models/pixtral/test_modeling_pixtral.py @@ -14,22 +14,16 @@ # limitations under the License. """Testing suite for the PyTorch Pixtral model.""" -import gc import unittest -import requests - from transformers import ( - AutoProcessor, PixtralVisionConfig, PixtralVisionModel, is_torch_available, is_vision_available, ) from transformers.testing_utils import ( - require_bitsandbytes, require_torch, - slow, torch_device, ) @@ -43,7 +37,7 @@ is_torch_greater_or_equal_than_2_0 = False if is_vision_available(): - from PIL import Image + pass class PixtralVisionModelTester: @@ -148,6 +142,7 @@ class PixtralVisionModelModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = (PixtralVisionModel,) if is_torch_available() else () test_pruning = False test_head_masking = False + test_torchscript = False def setUp(self): self.model_tester = PixtralVisionModelTester(self) @@ -258,35 +253,3 @@ def test_disk_offload_safetensors(self): @unittest.skip(reason="Not supported yet") def test_determinism(self): pass - - -@require_torch -class PixtralVisionModelIntegrationTest(unittest.TestCase): - def setUp(self): - self.processor = AutoProcessor.from_pretrained("hf-internal-testing/pixtral-12b") - - def tearDown(self): - gc.collect() - torch.cuda.empty_cache() - - @slow - @require_bitsandbytes - def test_small_model_integration_test(self): - # Let' s make sure we test the preprocessing to replace what is used - model = PixtralVisionModel.from_pretrained("hf-internal-testing/pixtral-12b", load_in_4bit=True) - - prompt = "[INST][IMG]\nWhat are the things I should be cautious about when I visit this 
place?[/INST]" - image_file = "https://pixtral-vl.github.io/static/images/view.jpg" - raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = self.processor(prompt, raw_image, return_tensors="pt") - - EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]]) # fmt: skip - self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS)) - - output = model.generate(**inputs, max_new_tokens=20) - EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly," # fmt: skip - - self.assertEqual( - self.processor.decode(output[0], skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index 8cdbf93c647..c3496dff3cd 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -171,7 +171,7 @@ def test_processor_with_multiple_images_single_list(self): input_ids[0].tolist(), # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] - ) + ) # fmt: on # Test passing in a url @@ -246,6 +246,25 @@ def test_processor_with_multiple_images_multiple_lists(self): ) # fmt: on + def test_processor_returns_full_length_batches(self): + # to avoid https://github.com/huggingface/transformers/issues/34204 + processor = self.processor_class.from_pretrained(self.tmpdirname) + prompt_string = [ + "USER: [IMG]\nWhat's the content of the image? 
ASSISTANT:", + ] * 5 + processor.tokenizer.pad_token = "" + image_inputs = [self.image_0] * 5 + + # Make small for checking image token expansion + processor.image_processor.size = {"longest_edge": 30} + processor.image_processor.patch_size = {"height": 2, "width": 2} + + # Test passing in an image + inputs_image = processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True) + self.assertIn("input_ids", inputs_image) + self.assertTrue(len(inputs_image["input_ids"]) == 5) + self.assertTrue(len(inputs_image["pixel_values"]) == 5) + # Override as PixtralProcessor needs nested images to work properly with batched inputs @require_vision def prepare_image_inputs(self, batch_size: Optional[int] = None): From eab6c491d439e83d5e31c660df6f7e36592eb0a2 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:54:10 +0100 Subject: [PATCH 73/99] Use torch 2.5 in scheduled CI (#34465) * torch 2.5 * try --------- Co-authored-by: ydshieh --- docker/transformers-all-latest-gpu/Dockerfile | 2 +- docker/transformers-pytorch-gpu/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 7ad4e96d62c..b597f5a73fb 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -9,7 +9,7 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.4.0' +ARG PYTORCH='2.5.1' # (not always a valid torch version) ARG INTEL_TORCH_EXT='2.3.0' # Example: `cu102`, `cu113`, etc. 
diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile index 62578ad0f36..f22d77b9372 100644 --- a/docker/transformers-pytorch-gpu/Dockerfile +++ b/docker/transformers-pytorch-gpu/Dockerfile @@ -11,7 +11,7 @@ ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF # If set to nothing, will install the latest version -ARG PYTORCH='2.4.0' +ARG PYTORCH='2.5.1' ARG TORCH_VISION='' ARG TORCH_AUDIO='' # Example: `cu102`, `cu113`, etc. From 5251fe6271bec670f71a6c1a86f4a2049fb03a90 Mon Sep 17 00:00:00 2001 From: Vladislav Bronzov <58587565+VladOS95-cyber@users.noreply.github.com> Date: Wed, 30 Oct 2024 16:52:17 +0100 Subject: [PATCH 74/99] Add GGUF for Mamba (#34200) * add mamba architecture for gguf * add logic for weights conversion, some fixes and refactoring * add lm_head layers, unit test refactoring * more fixes for tests * remove lm_head creation * remove unused comments --- docs/source/en/gguf.md | 1 + src/transformers/integrations/ggml.py | 25 +++++++++ .../modeling_gguf_pytorch_utils.py | 13 +++++ tests/quantization/ggml/test_ggml.py | 56 ++++++++++++++++++- 4 files changed, 93 insertions(+), 2 deletions(-) diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md index 20531b990bc..2da721b2898 100644 --- a/docs/source/en/gguf.md +++ b/docs/source/en/gguf.md @@ -86,6 +86,7 @@ For now the supported model architectures are the architectures that have been v - GPT2 - Starcoder2 - T5 +- Mamba ## Example usage diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 4a2740fcb30..f4545f2698c 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -235,6 +235,19 @@ "output.weight": "lm_head.weight", "output_norm": "model.norm", }, + "mamba": { + "token_embd": "backbone.embeddings", + "blk": "backbone.layers", + "ssm_a": "mixer.A_log", + "ssm_conv1d": "mixer.conv1d", + "ssm_in": 
"mixer.in_proj", + "ssm_out": "mixer.out_proj", + "ssm_x": "mixer.x_proj", + "ssm_dt": "mixer.dt_proj", + "attn_norm": "norm", + "output_norm": "backbone.norm_f", + "output.weight": "lm_head.weight", + }, } @@ -373,6 +386,17 @@ "attention.head_count_kv": "num_key_value_heads", "attention.layer_norm_epsilon": "norm_epsilon", }, + "mamba": { + "vocab_size": "vocab_size", + "context_length": "max_position_embeddings", + "embedding_length": "hidden_size", + "attention.layer_norm_rms_epsilon": "layer_norm_epsilon", + "block_count": "num_hidden_layers", + "ssm.conv_kernel": "conv_kernel", + "ssm.state_size": "state_size", + "ssm.time_step_rank": "time_step_rank", + "ssm.inner_size": "intermediate_size", + }, } GGUF_TOKENIZER_MAPPING = { @@ -768,6 +792,7 @@ def converted(self) -> Tokenizer: "gpt2": GGUFGPTConverter, "starcoder2": GGUFGPTConverter, "t5": GGUFT5Converter, + "mamba": GGUFGPTConverter, } diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py index 171b2f4d15b..c784ca0eb4c 100644 --- a/src/transformers/modeling_gguf_pytorch_utils.py +++ b/src/transformers/modeling_gguf_pytorch_utils.py @@ -220,6 +220,19 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): name = "lm_head.weight" parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights)) continue + if architecture == "mamba": + if "ssm_d" in name and "bias" not in name and "weight" not in name: + # ssm_d has conflicts with ssm_dt in name checking + # we have to explicitly check that name is exactly ssm_d + name = name.replace("ssm_d", "mixer.D") + if "ssm_conv1d.weight" in name: + # for compatibility tensor ssm_conv1d must be (5120, 1, 4]) dim, + # quantized one is (5120, 4) + weights = np.expand_dims(weights, axis=1) + if "ssm_a" in name: + # Original exponential implementation + # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L2975-L2977 + weights = np.log(-weights) for tensor_name in 
tensor_key_mapping: if tensor_name.format(bid=bid) in name: diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index ddc791e96a6..da1af9bff8d 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -59,6 +59,8 @@ class GgufIntegrationTests(unittest.TestCase): starcoder2_model_id = "QuantFactory/starcoder2-3b-GGUF" starcoder2_fp16_model_id = "brittlewis12/starcoder2-3b-GGUF" starcoder2_original_model_id = "bigcode/starcoder2-3b" + mamba_original_model_id = "state-spaces/mamba-2.8b-hf" + mamba_model_id = "jpodivin/mamba-2.8b-hf-GGUF" # standard quants q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf" @@ -102,6 +104,8 @@ class GgufIntegrationTests(unittest.TestCase): q6_k_gpt2_xl_model_id = "gpt2-xl.Q6_K.gguf" q6_k_starcoder2_model_id = "starcoder2-3b.Q6_K.gguf" fp16_starcoder2_gguf_model_id = "starcoder2-3b.fp16.gguf" + q6_k_mamba_model_id = "ggml-model-Q6_K.gguf" + fp16_mamba_model_id = "ggml-model-f16.gguf" example_text = "Hello" @@ -573,6 +577,8 @@ def test_gpt2_weights_conversion_fp16(self): if layer_name in quantized_state_dict: self.assertTrue(original_params.shape == quantized_state_dict[layer_name].shape) torch.testing.assert_close(original_params, quantized_state_dict[layer_name]) + else: + raise ValueError(f"Layer {layer_name} is not presented in GGUF model") def test_gpt2_xl_Q6_K(self): tokenizer = AutoTokenizer.from_pretrained(self.gpt2_xl_model_id, gguf_file=self.q6_k_gpt2_xl_model_id) @@ -639,6 +645,8 @@ def test_falcon7b_weights_conversion_fp16(self): if layer_name in quantized_state_dict: self.assertTrue(original_params.shape == quantized_state_dict[layer_name].shape) torch.testing.assert_close(original_params, quantized_state_dict[layer_name]) + else: + raise ValueError(f"Layer {layer_name} is not presented in GGUF model") def test_stablelm_q4_k_m(self): model = AutoModelForCausalLM.from_pretrained( @@ -708,6 +716,8 @@ def 
test_stablelm_weights_conversion_fp16(self): if layer_name in converted_state_dict: self.assertTrue(original_params.shape == converted_state_dict[layer_name].shape) torch.testing.assert_close(original_params, converted_state_dict[layer_name]) + else: + raise ValueError(f"Layer {layer_name} is not presented in GGUF model") def test_starcoder2_weights_conversion_fp16(self): original_model = AutoModelForCausalLM.from_pretrained( @@ -727,10 +737,11 @@ def test_starcoder2_weights_conversion_fp16(self): original_state_dict = original_model.state_dict() for layer_name, original_params in original_state_dict.items(): - if layer_name in converted_state_dict and layer_name != "lm_head.weight": - # quantized models do not contain "lm_head.weight" layer + if layer_name in converted_state_dict: self.assertTrue(original_params.shape == converted_state_dict[layer_name].shape) torch.testing.assert_close(original_params, converted_state_dict[layer_name]) + else: + raise ValueError(f"Layer {layer_name} is not presented in GGUF model") def test_starcoder2_q6_k(self): example_function_text = "def print_hello_world():" @@ -748,6 +759,47 @@ def test_starcoder2_q6_k(self): EXPECTED_TEXT = 'def print_hello_world():\n print("Hello World")\n\ndef print' self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + def test_mamba_weights_conversion_fp16(self): + original_model = AutoModelForCausalLM.from_pretrained( + self.mamba_original_model_id, + torch_dtype=torch.float16, + ) + + converted_model = AutoModelForCausalLM.from_pretrained( + self.mamba_model_id, + gguf_file=self.fp16_mamba_model_id, + torch_dtype=torch.float16, + ) + + converted_state_dict = converted_model.state_dict() + original_state_dict = original_model.state_dict() + + for layer_name, original_params in original_state_dict.items(): + if layer_name in converted_state_dict: + self.assertTrue(original_params.shape == converted_state_dict[layer_name].shape) + if "mixer.A_log" in layer_name: + # we 
should increase tolerance after exponential reversing + # and performing np.log(-weights) operation as numbers are slightly different + torch.testing.assert_close(original_params, converted_state_dict[layer_name], atol=1e-3, rtol=1e-3) + else: + torch.testing.assert_close(original_params, converted_state_dict[layer_name]) + else: + raise ValueError(f"Layer {layer_name} is not presented in GGUF model") + + def test_mamba_q6_k(self): + model = AutoModelForCausalLM.from_pretrained( + self.mamba_model_id, + gguf_file=self.q6_k_mamba_model_id, + torch_dtype=torch.float16, + ) + + tokenizer = AutoTokenizer.from_pretrained(self.mamba_model_id, gguf_file=self.q6_k_mamba_model_id) + text = tokenizer(self.example_text, return_tensors="pt")["input_ids"] + out = model.generate(text, max_new_tokens=10) + + EXPECTED_TEXT = "Hello,I answerthe question.\n\nA" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + def test_tokenization_xnli(self): import tqdm from datasets import load_dataset From 9f06fb05059a973048f5865e7e385c9db5d6daa4 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Wed, 30 Oct 2024 23:55:16 +0800 Subject: [PATCH 75/99] Fix super tiny extra space typo (#34440) Update training_args.py --- src/transformers/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 3e5c6cc2f37..0653c8a2cb7 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1030,7 +1030,7 @@ class TrainingArguments: use_cpu: bool = field( default=False, metadata={ - "help": " Whether or not to use cpu. If set to False, we will use cuda/tpu/mps/npu device if available." + "help": "Whether or not to use cpu. If set to False, we will use cuda/tpu/mps/npu device if available." 
}, ) use_mps_device: bool = field( From 48872fd6ae336fbde6fac7706910a9a4bc48210e Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Wed, 30 Oct 2024 13:49:47 -0400 Subject: [PATCH 76/99] Add Image Processor Fast RT-DETR (#34354) * add fast image processor rtdetr * add gpu/cpu test and fix docstring * remove prints * add to doc * nit docstring * avoid iterating over images/annotations several times * change torch typing * Add image processor fast documentation --- .../source/en/main_classes/image_processor.md | 43 + docs/source/en/model_doc/rt_detr.md | 8 +- src/transformers/__init__.py | 4 +- .../image_processing_utils_fast.py | 67 +- .../models/auto/image_processing_auto.py | 2 +- .../models/detr/image_processing_detr_fast.py | 208 ++--- src/transformers/models/rt_detr/__init__.py | 2 + .../rt_detr/image_processing_rt_detr.py | 17 +- .../rt_detr/image_processing_rt_detr_fast.py | 798 ++++++++++++++++++ .../utils/dummy_vision_objects.py | 7 + .../models/detr/test_image_processing_detr.py | 4 +- .../rt_detr/test_image_processing_rt_detr.py | 424 ++++++---- 12 files changed, 1259 insertions(+), 325 deletions(-) create mode 100644 src/transformers/models/rt_detr/image_processing_rt_detr_fast.py diff --git a/docs/source/en/main_classes/image_processor.md b/docs/source/en/main_classes/image_processor.md index 59a78e68214..320916f1ce9 100644 --- a/docs/source/en/main_classes/image_processor.md +++ b/docs/source/en/main_classes/image_processor.md @@ -18,6 +18,49 @@ rendered properly in your Markdown viewer. An image processor is in charge of preparing input features for vision models and post processing their outputs. This includes transformations such as resizing, normalization, and conversion to PyTorch, TensorFlow, Flax and Numpy tensors. It may also include model specific post-processing such as converting logits to segmentation masks. 
+Fast image processors are available for a few models and more will be added in the future. They are based on the [torchvision](https://pytorch.org/vision/stable/index.html) library and provide a significant speed-up, especially when processing on GPU. +They have the same API as the base image processors and can be used as drop-in replacements. +To use a fast image processor, you need to install the `torchvision` library, and set the `use_fast` argument to `True` when instantiating the image processor: + +```python +from transformers import AutoImageProcessor + +processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True) +``` + +When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. + +```python +from torchvision.io import read_image +from transformers import DetrImageProcessorFast + +images = read_image("image.jpg") +processor = DetrImageProcessorFast.from_pretrained("facebook/detr-resnet-50") +images_processed = processor(images, return_tensors="pt", device="cuda") +``` + +Here are some speed comparisons between the base and fast image processors for the `DETR` and `RT-DETR` models, and how they impact overall inference time: + +
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon.com/ec2/instance-types/g5/), utilizing an NVIDIA A10G Tensor Core GPU. + ## ImageProcessingMixin diff --git a/docs/source/en/model_doc/rt_detr.md b/docs/source/en/model_doc/rt_detr.md index 5540266c621..8ad220dc4bd 100644 --- a/docs/source/en/model_doc/rt_detr.md +++ b/docs/source/en/model_doc/rt_detr.md @@ -46,7 +46,7 @@ Initially, an image is processed using a pre-trained convolutional neural networ >>> from PIL import Image >>> from transformers import RTDetrForObjectDetection, RTDetrImageProcessor ->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) >>> image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd") @@ -95,6 +95,12 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h - preprocess - post_process_object_detection +## RTDetrImageProcessorFast + +[[autodoc]] RTDetrImageProcessorFast + - preprocess + - post_process_object_detection + ## RTDetrModel [[autodoc]] RTDetrModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index cc8b0739502..e6789c77fb8 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1228,7 +1228,7 @@ _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"]) _import_structure["models.pvt"].extend(["PvtImageProcessor"]) _import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"]) - _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor"]) + _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor", "RTDetrImageProcessorFast"]) _import_structure["models.sam"].extend(["SamImageProcessor"]) _import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"]) 
_import_structure["models.seggpt"].extend(["SegGptImageProcessor"]) @@ -6152,7 +6152,7 @@ ) from .models.pvt import PvtImageProcessor from .models.qwen2_vl import Qwen2VLImageProcessor - from .models.rt_detr import RTDetrImageProcessor + from .models.rt_detr import RTDetrImageProcessor, RTDetrImageProcessorFast from .models.sam import SamImageProcessor from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor from .models.seggpt import SegGptImageProcessor diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index d1a08132d73..3c1be325b7e 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -15,14 +15,18 @@ import functools from dataclasses import dataclass +from typing import Any, Iterable, List, Optional, Tuple from .image_processing_utils import BaseImageProcessor -from .utils.import_utils import is_torchvision_available +from .utils.import_utils import is_torch_available, is_torchvision_available if is_torchvision_available(): from torchvision.transforms import Compose +if is_torch_available(): + import torch + @dataclass(frozen=True) class SizeDict: @@ -66,3 +70,64 @@ def to_dict(self): encoder_dict = super().to_dict() encoder_dict.pop("_transform_params", None) return encoder_dict + + +def get_image_size_for_max_height_width( + image_size: Tuple[int, int], + max_height: int, + max_width: int, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important: even if image_height < max_height and image_width < max_width, the image will be resized + so that at least one of its edges is equal to max_height or max_width.
+ + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + image_size (`Tuple[int, int]`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + """ + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + +def safe_squeeze(tensor: "torch.Tensor", axis: Optional[int] = None) -> "torch.Tensor": + """ + Squeezes a tensor, but only if the axis specified has dim 1. + """ + if axis is None: + return tensor.squeeze() + + try: + return tensor.squeeze(axis=axis) + except ValueError: + return tensor + + +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +def get_max_height_width(images: List["torch.Tensor"]) -> Tuple[int]: + """ + Get the maximum height and width across all images in a batch. 
+ """ + + _, max_height, max_width = max_across_indices([img.shape for img in images]) + + return (max_height, max_width) diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index d181afeb2d4..5698abe15c8 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -123,7 +123,7 @@ ("qwen2_vl", ("Qwen2VLImageProcessor",)), ("regnet", ("ConvNextImageProcessor",)), ("resnet", ("ConvNextImageProcessor",)), - ("rt_detr", "RTDetrImageProcessor"), + ("rt_detr", ("RTDetrImageProcessor", "RTDetrImageProcessorFast")), ("sam", ("SamImageProcessor",)), ("segformer", ("SegformerImageProcessor",)), ("seggpt", ("SegGptImageProcessor",)), diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index 0fa1d0ffd9d..eadde59e55e 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -21,7 +21,13 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union from ...image_processing_utils import BatchFeature, get_size_dict -from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + SizeDict, + get_image_size_for_max_height_width, + get_max_height_width, + safe_squeeze, +) from ...image_transforms import ( center_to_corners_format, corners_to_center_format, @@ -55,7 +61,6 @@ compute_segments, convert_segmentation_to_rle, get_size_with_aspect_ratio, - max_across_indices, remove_low_and_no_objects, ) @@ -85,60 +90,6 @@ SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) -def get_image_size_for_max_height_width( - image_size: Tuple[int, int], - max_height: int, - max_width: int, -) -> Tuple[int, int]: - """ - Computes the output image size given the input 
image and the maximum allowed height and width. Keep aspect ratio. - Important, even if image_height < max_height and image_width < max_width, the image will be resized - to at least one of the edges be equal to max_height or max_width. - - For example: - - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) - - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) - - Args: - image_size (`Tuple[int, int]`): - The image to resize. - max_height (`int`): - The maximum allowed height. - max_width (`int`): - The maximum allowed width. - """ - height, width = image_size - height_scale = max_height / height - width_scale = max_width / width - min_scale = min(height_scale, width_scale) - new_height = int(height * min_scale) - new_width = int(width * min_scale) - return new_height, new_width - - -def safe_squeeze(tensor: torch.Tensor, axis: Optional[int] = None) -> torch.Tensor: - """ - Squeezes a tensor, but only if the axis specified has dim 1. - """ - if axis is None: - return tensor.squeeze() - - try: - return tensor.squeeze(axis=axis) - except ValueError: - return tensor - - -def get_max_height_width(images: List[torch.Tensor]) -> Tuple[int]: - """ - Get the maximum height and width across all images in a batch. - """ - - _, max_height, max_width = max_across_indices([img.shape for img in images]) - - return (max_height, max_width) - - # inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33 def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor: """ @@ -191,18 +142,21 @@ def prepare_coco_detection_annotation( # Get all COCO annotations for the given image. 
annotations = target["annotations"] - annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0] + classes = [] + area = [] + boxes = [] + keypoints = [] + for obj in annotations: + if "iscrowd" not in obj or obj["iscrowd"] == 0: + classes.append(obj["category_id"]) + area.append(obj["area"]) + boxes.append(obj["bbox"]) + if "keypoints" in obj: + keypoints.append(obj["keypoints"]) - classes = [obj["category_id"] for obj in annotations] classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device) - - # for conversion to coco api - area = torch.as_tensor([obj["area"] for obj in annotations], dtype=torch.float32, device=image.device) - iscrowd = torch.as_tensor( - [obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=torch.int64, device=image.device - ) - - boxes = [obj["bbox"] for obj in annotations] + area = torch.as_tensor(area, dtype=torch.float32, device=image.device) + iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device) # guard against no boxes via resizing boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4) boxes[:, 2:] += boxes[:, :2] @@ -211,19 +165,16 @@ def prepare_coco_detection_annotation( keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) - new_target = {} - new_target["image_id"] = image_id - new_target["class_labels"] = classes[keep] - new_target["boxes"] = boxes[keep] - new_target["area"] = area[keep] - new_target["iscrowd"] = iscrowd[keep] - new_target["orig_size"] = torch.as_tensor( - [int(image_height), int(image_width)], dtype=torch.int64, device=image.device - ) + new_target = { + "image_id": image_id, + "class_labels": classes[keep], + "boxes": boxes[keep], + "area": area[keep], + "iscrowd": iscrowd[keep], + "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device), + } - if annotations and "keypoints" in annotations[0]: - keypoints = [obj["keypoints"] for obj 
in annotations] - # Converting the filtered keypoints list to a numpy array + if keypoints: keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device) # Apply the keep mask here to filter the relevant annotations keypoints = keypoints[keep] @@ -911,84 +862,81 @@ def preprocess( if input_data_format == ChannelDimension.LAST: images = [image.permute(2, 0, 1).contiguous() for image in images] - # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) - if annotations is not None: - prepared_images = [] - prepared_annotations = [] - for image, target in zip(images, annotations): - target = self.prepare_annotation( + if do_rescale and do_normalize: + # fused rescale and normalize + new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor) + new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor) + + processed_images = [] + processed_annotations = [] + pixel_masks = [] # Initialize pixel_masks here + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + annotation = self.prepare_annotation( image, - target, + annotation, format, return_segmentation_masks=return_segmentation_masks, masks_path=masks_path, input_data_format=input_data_format, ) - prepared_images.append(image) - prepared_annotations.append(target) - images = prepared_images - annotations = prepared_annotations - del prepared_images, prepared_annotations - - if do_resize: - if isinstance(resample, (PILImageResampling, int)): - interpolation = pil_torch_interpolation_mapping[resample] - else: - interpolation = resample - resized_images = [self.resize(image, size=size, interpolation=interpolation) for image in images] - if annotations is not None: - for i, (image, target) in enumerate(zip(resized_images, annotations)): - 
annotations[i] = self.resize_annotation( - target, - orig_size=images[i].size()[-2:], - target_size=image.size()[-2:], + + if do_resize: + interpolation = ( + pil_torch_interpolation_mapping[resample] + if isinstance(resample, (PILImageResampling, int)) + else resample + ) + resized_image = self.resize(image, size=size, interpolation=interpolation) + if annotations is not None: + annotation = self.resize_annotation( + annotation, + orig_size=image.size()[-2:], + target_size=resized_image.size()[-2:], ) - images = resized_images - del resized_images + image = resized_image - if do_rescale and do_normalize: - # fused rescale and normalize - new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor) - new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor) - images = [F.normalize(image.to(dtype=torch.float32), new_mean, new_std) for image in images] - elif do_rescale: - images = [image * rescale_factor for image in images] - elif do_normalize: - images = [F.normalize(image, image_mean, image_std) for image in images] - - if do_convert_annotations and annotations is not None: - annotations = [ - self.normalize_annotation(annotation, get_image_size(image, input_data_format)) - for annotation, image in zip(annotations, images) - ] + if do_rescale and do_normalize: + # fused rescale and normalize + image = F.normalize(image.to(dtype=torch.float32), new_mean, new_std) + elif do_rescale: + image = image * rescale_factor + elif do_normalize: + image = F.normalize(image, image_mean, image_std) + + if do_convert_annotations and annotations is not None: + annotation = self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + + processed_images.append(image) + processed_annotations.append(annotation) + images = processed_images + annotations = processed_annotations if annotations is not None else None if do_pad: - # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + # 
depends on all resized image shapes so we need another loop if pad_size is not None: padded_size = (pad_size["height"], pad_size["width"]) else: padded_size = get_max_height_width(images) - annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] - pixel_masks = [] padded_annotations = [] - for image, annotation in zip(images, annotation_list): + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} if padded_size == image.size()[-2:]: padded_images.append(image) pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device)) padded_annotations.append(annotation) continue - padded_image, pixel_mask, padded_annotation = self.pad( + image, pixel_mask, annotation = self.pad( image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations ) - padded_images.append(padded_image) + padded_images.append(image) + padded_annotations.append(annotation) pixel_masks.append(pixel_mask) - padded_annotations.append(padded_annotation) images = padded_images - if annotations is not None: - annotations = padded_annotations - del padded_images, padded_annotations + annotations = padded_annotations if annotations is not None else None data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)}) data.update({"pixel_values": torch.stack(images, dim=0)}) diff --git a/src/transformers/models/rt_detr/__init__.py b/src/transformers/models/rt_detr/__init__.py index 94a428c6668..52453f38b2c 100644 --- a/src/transformers/models/rt_detr/__init__.py +++ b/src/transformers/models/rt_detr/__init__.py @@ -26,6 +26,7 @@ pass else: _import_structure["image_processing_rt_detr"] = ["RTDetrImageProcessor"] + _import_structure["image_processing_rt_detr_fast"] = ["RTDetrImageProcessorFast"] try: if not is_torch_available(): @@ -55,6 +56,7 @@ pass else: from .image_processing_rt_detr import 
RTDetrImageProcessor + from .image_processing_rt_detr_fast import RTDetrImageProcessorFast try: if not is_torch_available(): diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr.py b/src/transformers/models/rt_detr/image_processing_rt_detr.py index 44b2702aa63..eead5b18693 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr.py @@ -1062,10 +1062,8 @@ def post_process_object_detection( raise ValueError( "Make sure that you pass in as many target sizes as the batch dimension of the logits" ) - if isinstance(target_sizes, List): - img_h = torch.Tensor([i[0] for i in target_sizes]) - img_w = torch.Tensor([i[1] for i in target_sizes]) + img_h, img_w = torch.as_tensor(target_sizes).unbind(1) else: img_h, img_w = target_sizes.unbind(1) scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) @@ -1089,10 +1087,13 @@ def post_process_object_detection( boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1])) results = [] - for s, l, b in zip(scores, labels, boxes): - score = s[s > threshold] - label = l[s > threshold] - box = b[s > threshold] - results.append({"scores": score, "labels": label, "boxes": box}) + for score, label, box in zip(scores, labels, boxes): + results.append( + { + "scores": score[score > threshold], + "labels": label[score > threshold], + "boxes": box[score > threshold], + } + ) return results diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py new file mode 100644 index 00000000000..9f63b5b7ced --- /dev/null +++ b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py @@ -0,0 +1,798 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for RT-DETR.""" + +import functools +import pathlib +from typing import Any, Dict, List, Optional, Tuple, Union + +from ...image_processing_utils import BatchFeature, get_size_dict +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + SizeDict, + get_image_size_for_max_height_width, + get_max_height_width, + safe_squeeze, +) +from ...image_transforms import ( + center_to_corners_format, + corners_to_center_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + AnnotationFormat, + AnnotationType, + ChannelDimension, + ImageInput, + ImageType, + PILImageResampling, + get_image_size, + get_image_type, + infer_channel_dimension_format, + make_list_of_images, + pil_torch_interpolation_mapping, + validate_annotations, +) +from ...utils import ( + TensorType, + filter_out_non_signature_kwargs, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available, + logging, + requires_backends, +) +from .image_processing_rt_detr import ( + get_size_with_aspect_ratio, +) + + +if is_torch_available(): + import torch + + +if is_torchvision_available(): + from ...image_utils import pil_torch_interpolation_mapping + + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F + + +logger = logging.get_logger(__name__) + 
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION,) + + +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by RT-DETR. + """ + image_height, image_width = image.size()[-2:] + + image_id = target["image_id"] + image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device) + + # Get all COCO annotations for the given image. + annotations = target["annotations"] + classes = [] + area = [] + boxes = [] + keypoints = [] + for obj in annotations: + if "iscrowd" not in obj or obj["iscrowd"] == 0: + classes.append(obj["category_id"]) + area.append(obj["area"]) + boxes.append(obj["bbox"]) + if "keypoints" in obj: + keypoints.append(obj["keypoints"]) + + classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device) + area = torch.as_tensor(area, dtype=torch.float32, device=image.device) + iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device) + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = { + "image_id": image_id, + "class_labels": classes[keep], + "boxes": boxes[keep], + "area": area[keep], + "iscrowd": iscrowd[keep], + "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device), + } + + if keypoints: + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if 
num_keypoints else keypoints + new_target["keypoints"] = keypoints + + return new_target + + +class RTDetrImageProcessorFast(BaseImageProcessorFast): + r""" + Constructs a fast RT-DETR image processor. + + Args: + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. Only "coco_detection" is supported. + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]`, *optional*, defaults to `{"height": 640, "width": 640}`): + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image.
Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `False`): + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the RT-DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `False`): + Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch.
+ """ + + model_input_names = ["pixel_values", "pixel_mask"] + + def __init__( + self, + format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: Union[PILImageResampling, F.InterpolationMode] = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = False, + image_mean: Union[float, List[float]] = None, + image_std: Union[float, List[float]] = None, + do_convert_annotations: bool = True, + do_pad: bool = False, + pad_size: Optional[Dict[str, int]] = None, + **kwargs, + ) -> None: + size = size if size is not None else {"height": 640, "width": 640} + size = get_size_dict(size, default_to_square=False) + + if do_convert_annotations is None: + do_convert_annotations = do_normalize + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.do_convert_annotations = do_convert_annotations + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + self.pad_size = pad_size + + def prepare_annotation( + self, + image: torch.Tensor, + target: Dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: bool = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Dict: + """ + Prepare an annotation for feeding into RTDETR model. 
+ """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize + def resize( + self, + image: torch.Tensor, + size: SizeDict, + interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR, + **kwargs, + ) -> torch.Tensor: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + Resampling filter to use if resizing the image. + """ + if size.shortest_edge and size.longest_edge: + # Resize the image so that the shortest edge or the longest edge is of the given size + # while maintaining the aspect ratio of the original image. 
+ new_size = get_size_with_aspect_ratio( + image.size()[-2:], + size["shortest_edge"], + size["longest_edge"], + ) + elif size.max_height and size.max_width: + new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"]) + elif size.height and size.width: + new_size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + + image = F.resize( + image, + size=new_size, + interpolation=interpolation, + **kwargs, + ) + return image + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize_annotation + def resize_annotation( + self, + annotation: Dict[str, Any], + orig_size: Tuple[int, int], + target_size: Tuple[int, int], + threshold: float = 0.5, + interpolation: F.InterpolationMode = F.InterpolationMode.NEAREST, + ): + """ + Resizes an annotation to a target size. + + Args: + annotation (`Dict[str, Any]`): + The annotation dictionary. + orig_size (`Tuple[int, int]`): + The original size of the input image. + target_size (`Tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`): + The resampling filter to use when resizing the masks. 
+ """ + ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * torch.as_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device + ) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks] + masks = torch.stack(masks).to(torch.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.normalize_annotation + def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= torch.as_tensor( + [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device + ) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._update_annotation_for_padded_image + def _update_annotation_for_padded_image( + self, + annotation: Dict, + input_image_size: Tuple[int, int], + output_image_size: Tuple[int, int], + padding, + update_bboxes, + ) -> Dict: + """ + Update the annotation for a padded image. 
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.pad
def pad(
    self,
    image: torch.Tensor,
    padded_size: Tuple[int, int],
    annotation: Optional[Dict[str, Any]] = None,
    update_bboxes: bool = True,
    fill: int = 0,
):
    """
    Pad `image` up to `padded_size` and build the matching pixel mask.

    Args:
        image (`torch.Tensor`):
            Image whose last two dimensions are read as `(height, width)`.
        padded_size (`Tuple[int, int]`):
            Target `(height, width)`. Must not be smaller than the image in either dimension.
        annotation (`Dict[str, Any]`, *optional*):
            Annotation to update alongside the image. Only touched when padding is actually applied.
        update_bboxes (`bool`, *optional*, defaults to `True`):
            Forwarded to `_update_annotation_for_padded_image`.
        fill (`int`, *optional*, defaults to 0):
            Fill value handed to `F.pad` for the image.

    Returns:
        A tuple `(image, pixel_mask, annotation)`. `pixel_mask` is an int64 tensor of shape `padded_size` holding 1
        over the top-left `original_size` region and 0 elsewhere.

    Raises:
        ValueError: If `padded_size` is smaller than the image size in height or width.
    """
    original_size = image.size()[-2:]
    padding_bottom = padded_size[0] - original_size[0]
    padding_right = padded_size[1] - original_size[1]
    if padding_bottom < 0 or padding_right < 0:
        raise ValueError(
            f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
            f"original size. Got padded size: {padded_size}, original size: {original_size}."
        )
    if original_size != padded_size:
        # Only the right and bottom entries are non-zero, so the original content keeps its top-left position.
        padding = [0, 0, padding_right, padding_bottom]
        image = F.pad(image, padding, fill=fill)
        if annotation is not None:
            annotation = self._update_annotation_for_padded_image(
                annotation, original_size, padded_size, padding, update_bboxes
            )

    # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
    pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
    pixel_mask[: original_size[0], : original_size[1]] = 1

    return image, pixel_mask, annotation

@functools.lru_cache(maxsize=1)
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._validate_input_arguments
def _validate_input_arguments(
    self,
    do_rescale: bool,
    rescale_factor: float,
    do_normalize: bool,
    image_mean: Union[float, List[float]],
    image_std: Union[float, List[float]],
    do_resize: bool,
    size: Dict[str, int],
    resample: "PILImageResampling",
    data_format: Union[str, ChannelDimension],
    return_tensors: Union[TensorType, str],
):
    """
    Check that the resolved preprocessing arguments describe a supported configuration.

    The result is memoised with `functools.lru_cache`, so every argument (including `self`) must be hashable;
    `preprocess` converts `size`, `image_mean` and `image_std` to hashable values before calling this.

    Raises:
        ValueError: If `return_tensors` is not `"pt"`, if `data_format` is not `ChannelDimension.FIRST`, or if an
            enabled step (`do_resize`, `do_rescale`, `do_normalize`) is missing one of its parameters.
    """
    if return_tensors != "pt":
        raise ValueError("Only returning PyTorch tensors is currently supported.")

    if data_format != ChannelDimension.FIRST:
        raise ValueError("Only channel first data format is currently supported.")

    if do_resize and None in (size, resample):
        raise ValueError("Size and resample must be specified if do_resize is True.")

    if do_rescale and rescale_factor is None:
        raise ValueError("Rescale factor must be specified if do_rescale is True.")

    if do_normalize and None in (image_mean, image_std):
        raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")

@filter_out_non_signature_kwargs(extra=["device"])
def preprocess(
    self,
    images: ImageInput,
    annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None,
    return_segmentation_masks: Optional[bool] = None,
    masks_path: Optional[Union[str, pathlib.Path]] = None,
    do_resize: Optional[bool] = None,
    size: Optional[Dict[str, int]] = None,
    resample: Optional[Union[PILImageResampling, F.InterpolationMode]] = None,
    do_rescale: Optional[bool] = None,
    rescale_factor: Optional[Union[int, float]] = None,
    do_normalize: Optional[bool] = None,
    do_convert_annotations: Optional[bool] = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_pad: Optional[bool] = None,
    format: Optional[Union[str, AnnotationFormat]] = None,
    return_tensors: Optional[Union[TensorType, str]] = None,
    data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    pad_size: Optional[Dict[str, int]] = None,
    **kwargs,
) -> BatchFeature:
    """
    Preprocess an image or a batch of images so that it can be used by the model.

    Args:
        images (`ImageInput`):
            Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
            from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
        annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
            List of annotations associated with the image or batch of images. If annotation is for object
            detection, the annotations should be a dictionary with the following keys:
            - "image_id" (`int`): The image id.
            - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
              dictionary. An image can have no annotations, in which case the list should be empty.
            If annotation is for segmentation, the annotations should be a dictionary with the following keys:
            - "image_id" (`int`): The image id.
            - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
              An image can have no segments, in which case the list should be empty.
            - "file_name" (`str`): The file name of the image.
        return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks):
            Whether to return segmentation masks.
        masks_path (`str` or `pathlib.Path`, *optional*):
            Path to the directory containing the segmentation masks.
        do_resize (`bool`, *optional*, defaults to self.do_resize):
            Whether to resize the image.
        size (`Dict[str, int]`, *optional*, defaults to self.size):
            Size of the image's `(height, width)` dimensions after resizing. Available options are:
                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
                    Do NOT keep the aspect ratio.
                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
                    the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
                    less or equal to `longest_edge`.
                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
                    aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
                    `max_width`.
        resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to self.resample):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to self.do_rescale):
            Whether to rescale the image.
        rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
            Rescale factor to use when rescaling the image.
        do_normalize (`bool`, *optional*, defaults to self.do_normalize):
            Whether to normalize the image.
        do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
            Whether to convert the annotations to the format expected by the model. Converts the bounding
            boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
            and in relative coordinates.
        image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean):
            Mean to use when normalizing the image.
        image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
            Standard deviation to use when normalizing the image.
        do_pad (`bool`, *optional*, defaults to self.do_pad):
            Whether to pad the image. If `True`, padding will be applied to the bottom and right of
            the image with zeros. If `pad_size` is provided, the image will be padded to the specified
            dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
        format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
            Format of the annotations.
        return_tensors (`str` or `TensorType`, *optional*, defaults to `"pt"`):
            Type of tensors to return. Only `"pt"` passes validation.
        data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
            The channel dimension format for the output image. Only `ChannelDimension.FIRST` passes validation.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image. If unset, the channel dimension format is inferred
            from the first input image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        pad_size (`Dict[str, int]`, *optional*):
            The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
            height and width in the batch.
        device (passed through `kwargs`, *optional*):
            If given, every image tensor is moved to this device before processing.

    Returns:
        `BatchFeature` with `"pixel_values"`, plus `"pixel_mask"` when `do_pad` is enabled and `"labels"` when
        annotations are provided.

    Raises:
        ValueError: If the input image type is unsupported, if the arguments fail `_validate_input_arguments`, or
            if the number of images and annotations differ.
    """
    do_resize = self.do_resize if do_resize is None else do_resize
    size = self.size if size is None else size
    size = get_size_dict(size=size, default_to_square=True)
    resample = self.resample if resample is None else resample
    do_rescale = self.do_rescale if do_rescale is None else do_rescale
    rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
    do_normalize = self.do_normalize if do_normalize is None else do_normalize
    image_mean = self.image_mean if image_mean is None else image_mean
    image_std = self.image_std if image_std is None else image_std
    do_convert_annotations = (
        self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
    )
    do_pad = self.do_pad if do_pad is None else do_pad
    pad_size = self.pad_size if pad_size is None else pad_size
    format = self.format if format is None else format
    return_tensors = "pt" if return_tensors is None else return_tensors
    device = kwargs.pop("device", None)

    # Make hashable for cache
    size = SizeDict(**size)
    image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean
    image_std = tuple(image_std) if isinstance(image_std, list) else image_std

    images = make_list_of_images(images)
    image_type = get_image_type(images[0])

    if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
        raise ValueError(f"Unsupported input image type {image_type}")

    self._validate_input_arguments(
        do_rescale=do_rescale,
        rescale_factor=rescale_factor,
        do_normalize=do_normalize,
        image_mean=image_mean,
        image_std=image_std,
        do_resize=do_resize,
        size=size,
        resample=resample,
        return_tensors=return_tensors,
        data_format=data_format,
    )

    if annotations is not None and isinstance(annotations, dict):
        annotations = [annotations]

    if annotations is not None and len(images) != len(annotations):
        raise ValueError(
            f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
        )

    format = AnnotationFormat(format)
    if annotations is not None:
        validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)

    data = {}
    if image_type == ImageType.PIL:
        images = [F.pil_to_tensor(image) for image in images]
    elif image_type == ImageType.NUMPY:
        # not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays
        images = [torch.from_numpy(image).contiguous() for image in images]

    if device is not None:
        images = [image.to(device) for image in images]

    # We assume that all images have the same channel dimension format.
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(images[0])
    if input_data_format == ChannelDimension.LAST:
        images = [image.permute(2, 0, 1).contiguous() for image in images]
        # The tensors are channels-first from here on. Everything below that reads image sizes through
        # `input_data_format` (`prepare_annotation`, `get_image_size`) must see the layout the tensors
        # actually have, otherwise height/width are taken from the wrong axes.
        input_data_format = ChannelDimension.FIRST

    if do_rescale and do_normalize:
        # fused rescale and normalize
        new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor)
        new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor)

    if do_resize:
        # Loop-invariant: resolve the interpolation mode once instead of once per image.
        interpolation = (
            pil_torch_interpolation_mapping[resample]
            if isinstance(resample, (PILImageResampling, int))
            else resample
        )

    processed_images = []
    processed_annotations = []
    pixel_masks = []  # Initialize pixel_masks here
    for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
        # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
        if annotations is not None:
            annotation = self.prepare_annotation(
                image,
                annotation,
                format,
                return_segmentation_masks=return_segmentation_masks,
                masks_path=masks_path,
                input_data_format=input_data_format,
            )

        if do_resize:
            resized_image = self.resize(image, size=size, interpolation=interpolation)
            if annotations is not None:
                annotation = self.resize_annotation(
                    annotation,
                    orig_size=image.size()[-2:],
                    target_size=resized_image.size()[-2:],
                )
            image = resized_image

        if do_rescale and do_normalize:
            # fused rescale and normalize
            image = F.normalize(image.to(dtype=torch.float32), new_mean, new_std)
        elif do_rescale:
            image = image * rescale_factor
        elif do_normalize:
            # Mirror the fused branch: integer tensors are cast to float before normalising.
            # NOTE(review): torchvision's `normalize` is documented to reject non-float tensors; floating-point
            # inputs are passed through untouched so their dtype is preserved.
            if not image.is_floating_point():
                image = image.to(dtype=torch.float32)
            image = F.normalize(image, image_mean, image_std)

        if do_convert_annotations and annotations is not None:
            annotation = self.normalize_annotation(annotation, get_image_size(image, input_data_format))

        processed_images.append(image)
        processed_annotations.append(annotation)
    images = processed_images
    annotations = processed_annotations if annotations is not None else None

    if do_pad:
        # depends on all resized image shapes so we need another loop
        if pad_size is not None:
            padded_size = (pad_size["height"], pad_size["width"])
        else:
            padded_size = get_max_height_width(images)

        padded_images = []
        padded_annotations = []
        for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
            # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
            if padded_size == image.size()[-2:]:
                # Already at the target size: skip `self.pad` and emit an all-valid mask.
                padded_images.append(image)
                pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
                padded_annotations.append(annotation)
                continue
            image, pixel_mask, annotation = self.pad(
                image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
            )
            padded_images.append(image)
            padded_annotations.append(annotation)
            pixel_masks.append(pixel_mask)
        images = padded_images
        annotations = padded_annotations if annotations is not None else None
        data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})

    data.update({"pixel_values": torch.stack(images, dim=0)})
    encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
    if annotations is not None:
        encoded_inputs["labels"] = [
            BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
        ]
    return encoded_inputs
# Copied from transformers.models.rt_detr.image_processing_rt_detr.RTDetrImageProcessor.post_process_object_detection
def post_process_object_detection(
    self,
    outputs,
    threshold: float = 0.5,
    target_sizes: Union[TensorType, List[Tuple]] = None,
    use_focal_loss: bool = True,
):
    """
    Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
    bottom_right_x, bottom_right_y) format. Only supports PyTorch.

    Args:
        outputs ([`DetrObjectDetectionOutput`]):
            Raw outputs of the model.
        threshold (`float`, *optional*, defaults to 0.5):
            Score threshold to keep object detection predictions.
        target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
            Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
            `(height, width)` of each image in the batch. If unset, predictions will not be resized.
        use_focal_loss (`bool` defaults to `True`):
            Variable informing if the focal loss was used to predict the outputs. If `True`, a sigmoid is applied
            to compute the scores of each detection, otherwise, a softmax function is used.

    Returns:
        `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
        in the batch as predicted by the model.

    Raises:
        ValueError: If `target_sizes` is given and its length differs from the batch dimension of the logits.
    """
    requires_backends(self, ["torch"])
    out_logits, out_bbox = outputs.logits, outputs.pred_boxes
    # convert from relative cxcywh to absolute xyxy
    boxes = center_to_corners_format(out_bbox)
    if target_sizes is not None:
        if len(out_logits) != len(target_sizes):
            raise ValueError(
                "Make sure that you pass in as many target sizes as the batch dimension of the logits"
            )
        if isinstance(target_sizes, List):
            img_h, img_w = torch.as_tensor(target_sizes).unbind(1)
        else:
            img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
        boxes = boxes * scale_fct[:, None, :]

    num_top_queries = out_logits.shape[1]
    num_classes = out_logits.shape[2]

    if use_focal_loss:
        scores = torch.nn.functional.sigmoid(out_logits)
        # Rank every (query, class) pair jointly, then split the flat index back into query and class.
        scores, index = torch.topk(scores.flatten(1), num_top_queries, dim=-1)
        labels = index % num_classes
        index = index // num_classes
        boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
    else:
        # Normalise over the class axis explicitly. Without `dim`, softmax falls back to a deprecated implicit
        # choice that picks dim 0 for 3-D input, i.e. it would normalise across the batch instead of the classes.
        scores = torch.nn.functional.softmax(out_logits, dim=-1)[:, :, :-1]
        scores, labels = scores.max(dim=-1)
        # NOTE(review): `num_top_queries` is `out_logits.shape[1]`, which is also `scores.shape[1]` here, so
        # this guard looks unreachable as written; kept unchanged.
        if scores.shape[1] > num_top_queries:
            scores, index = torch.topk(scores, num_top_queries, dim=-1)
            labels = torch.gather(labels, dim=1, index=index)
            boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))

    results = []
    for score, label, box in zip(scores, labels, boxes):
        # Build the threshold mask once and reuse it for all three outputs.
        keep = score > threshold
        results.append(
            {
                "scores": score[keep],
                "labels": label[keep],
                "boxes": box[keep],
            }
        )

    return results
RTDetrImageProcessorFast(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class SamImageProcessor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index 976b306115b..f91c5208736 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ b/tests/models/detr/test_image_processing_detr.py @@ -677,7 +677,7 @@ def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} - processor = self.image_processor_list[1].from_pretrained("facebook/detr-resnet-50") + processor = self.image_processor_list[1]() # 1. run processor on CPU encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu") # 2. run processor on GPU @@ -734,7 +734,7 @@ def test_fast_processor_equivalence_cpu_gpu_coco_panoptic_annotations(self): masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") - processor = self.image_processor_list[1].from_pretrained("facebook/detr-resnet-50-panoptic") + processor = self.image_processor_list[1](format="coco_panoptic") # 1. 
run processor on CPU encoding_cpu = processor( images=image, annotations=target, masks_path=masks_path, return_tensors="pt", device="cpu" diff --git a/tests/models/rt_detr/test_image_processing_rt_detr.py b/tests/models/rt_detr/test_image_processing_rt_detr.py index 2a38664d433..e7bfbae3f9c 100644 --- a/tests/models/rt_detr/test_image_processing_rt_detr.py +++ b/tests/models/rt_detr/test_image_processing_rt_detr.py @@ -16,8 +16,8 @@ import requests -from transformers.testing_utils import require_torch, require_vision, slow -from transformers.utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow +from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -25,7 +25,7 @@ if is_vision_available(): from PIL import Image - from transformers import RTDetrImageProcessor + from transformers import RTDetrImageProcessor, RTDetrImageProcessorFast if is_torch_available(): import torch @@ -91,6 +91,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_vision class RtDetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processing_class = RTDetrImageProcessor if is_vision_available() else None + fast_image_processing_class = RTDetrImageProcessorFast if is_torchvision_available() else None def setUp(self): super().setUp() @@ -101,17 +102,19 @@ def image_processor_dict(self): return self.image_processor_tester.prepare_image_processor_dict() def test_image_processor_properties(self): - image_processing = self.image_processing_class(**self.image_processor_dict) - self.assertTrue(hasattr(image_processing, "do_resize")) - self.assertTrue(hasattr(image_processing, "size")) - self.assertTrue(hasattr(image_processing, "resample")) - self.assertTrue(hasattr(image_processing, "do_rescale")) - 
self.assertTrue(hasattr(image_processing, "rescale_factor")) - self.assertTrue(hasattr(image_processing, "return_tensors")) + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "resample")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "return_tensors")) def test_image_processor_from_dict_with_kwargs(self): - image_processor = self.image_processing_class.from_dict(self.image_processor_dict) - self.assertEqual(image_processor.size, {"height": 640, "width": 640}) + for image_processing_class in self.image_processor_list: + image_processor = image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 640, "width": 640}) def test_valid_coco_detection_annotations(self): # prepare image and target @@ -121,32 +124,33 @@ def test_valid_coco_detection_annotations(self): params = {"image_id": 39769, "annotations": target} - # encode them - image_processing = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd") + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class.from_pretrained("PekingU/rtdetr_r50vd") - # legal encodings (single image) - _ = image_processing(images=image, annotations=params, return_tensors="pt") - _ = image_processing(images=image, annotations=[params], return_tensors="pt") + # legal encodings (single image) + _ = image_processing(images=image, annotations=params, return_tensors="pt") + _ = image_processing(images=image, annotations=[params], return_tensors="pt") - # legal encodings (batch of one image) - _ = image_processing(images=[image], annotations=params, return_tensors="pt") - _ = 
image_processing(images=[image], annotations=[params], return_tensors="pt") + # legal encodings (batch of one image) + _ = image_processing(images=[image], annotations=params, return_tensors="pt") + _ = image_processing(images=[image], annotations=[params], return_tensors="pt") - # legal encoding (batch of more than one image) - n = 5 - _ = image_processing(images=[image] * n, annotations=[params] * n, return_tensors="pt") + # legal encoding (batch of more than one image) + n = 5 + _ = image_processing(images=[image] * n, annotations=[params] * n, return_tensors="pt") - # example of an illegal encoding (missing the 'image_id' key) - with self.assertRaises(ValueError) as e: - image_processing(images=image, annotations={"annotations": target}, return_tensors="pt") + # example of an illegal encoding (missing the 'image_id' key) + with self.assertRaises(ValueError) as e: + image_processing(images=image, annotations={"annotations": target}, return_tensors="pt") - self.assertTrue(str(e.exception).startswith("Invalid COCO detection annotations")) + self.assertTrue(str(e.exception).startswith("Invalid COCO detection annotations")) - # example of an illegal encoding (unequal lengths of images and annotations) - with self.assertRaises(ValueError) as e: - image_processing(images=[image] * n, annotations=[params] * (n - 1), return_tensors="pt") + # example of an illegal encoding (unequal lengths of images and annotations) + with self.assertRaises(ValueError) as e: + image_processing(images=[image] * n, annotations=[params] * (n - 1), return_tensors="pt") - self.assertTrue(str(e.exception) == "The number of images (5) and annotations (4) do not match.") + self.assertTrue(str(e.exception) == "The number of images (5) and annotations (4) do not match.") @slow def test_call_pytorch_with_coco_detection_annotations(self): @@ -157,55 +161,57 @@ def test_call_pytorch_with_coco_detection_annotations(self): target = {"image_id": 39769, "annotations": target} - # encode them - 
image_processing = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd") - encoding = image_processing(images=image, annotations=target, return_tensors="pt") - - # verify pixel values - expected_shape = torch.Size([1, 3, 640, 640]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - expected_slice = torch.tensor([0.5490, 0.5647, 0.5725]) - self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) - - # verify area - expected_area = torch.tensor([2827.9883, 5403.4761, 235036.7344, 402070.2188, 71068.8281, 79601.2812]) - self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) - # verify boxes - expected_boxes_shape = torch.Size([6, 4]) - self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) - expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) - # verify image_id - expected_image_id = torch.tensor([39769]) - self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) - # verify is_crowd - expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) - self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) - # verify class_labels - expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) - self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) - # verify orig_size - expected_orig_size = torch.tensor([480, 640]) - self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) - # verify size - expected_size = torch.tensor([640, 640]) - self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) + for image_processing_class in self.image_processor_list: + # encode them + image_processing = image_processing_class.from_pretrained("PekingU/rtdetr_r50vd") + encoding = image_processing(images=image, 
annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 640, 640]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.5490, 0.5647, 0.5725]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) + + # verify area + expected_area = torch.tensor([2827.9883, 5403.4761, 235036.7344, 402070.2188, 71068.8281, 79601.2812]) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) + # verify image_id + expected_image_id = torch.tensor([39769]) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) + # verify size + expected_size = torch.tensor([640, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) @slow def test_image_processor_outputs(self): image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - image_processing = self.image_processing_class(**self.image_processor_dict) - encoding = image_processing(images=image, return_tensors="pt") + for image_processing_class in self.image_processor_list: + image_processing = 
image_processing_class(**self.image_processor_dict) + encoding = image_processing(images=image, return_tensors="pt") - # verify pixel values: shape - expected_shape = torch.Size([1, 3, 640, 640]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) + # verify pixel values: shape + expected_shape = torch.Size([1, 3, 640, 640]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) - # verify pixel values: output values - expected_slice = torch.tensor([0.5490196347236633, 0.5647059082984924, 0.572549045085907]) - self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-5)) + # verify pixel values: output values + expected_slice = torch.tensor([0.5490196347236633, 0.5647059082984924, 0.572549045085907]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-5)) def test_multiple_images_processor_outputs(self): images_urls = [ @@ -224,31 +230,32 @@ def test_multiple_images_processor_outputs(self): image = Image.open(requests.get(url, stream=True).raw) images.append(image) - # apply image processing - image_processing = self.image_processing_class(**self.image_processor_dict) - encoding = image_processing(images=images, return_tensors="pt") - - # verify if pixel_values is part of the encoding - self.assertIn("pixel_values", encoding) - - # verify pixel values: shape - expected_shape = torch.Size([8, 3, 640, 640]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # verify pixel values: output values - expected_slices = torch.tensor( - [ - [0.5333333611488342, 0.5568627715110779, 0.5647059082984924], - [0.5372549295425415, 0.4705882668495178, 0.4274510145187378], - [0.3960784673690796, 0.35686275362968445, 0.3686274588108063], - [0.20784315466880798, 0.1882353127002716, 0.15294118225574493], - [0.364705890417099, 0.364705890417099, 0.3686274588108063], - [0.8078432083129883, 0.8078432083129883, 0.8078432083129883], - [0.4431372880935669, 
0.4431372880935669, 0.4431372880935669], - [0.19607844948768616, 0.21176472306251526, 0.3607843220233917], - ] - ) - self.assertTrue(torch.allclose(encoding["pixel_values"][:, 1, 0, :3], expected_slices, atol=1e-5)) + for image_processing_class in self.image_processor_list: + # apply image processing + image_processing = image_processing_class(**self.image_processor_dict) + encoding = image_processing(images=images, return_tensors="pt") + + # verify if pixel_values is part of the encoding + self.assertIn("pixel_values", encoding) + + # verify pixel values: shape + expected_shape = torch.Size([8, 3, 640, 640]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # verify pixel values: output values + expected_slices = torch.tensor( + [ + [0.5333333611488342, 0.5568627715110779, 0.5647059082984924], + [0.5372549295425415, 0.4705882668495178, 0.4274510145187378], + [0.3960784673690796, 0.35686275362968445, 0.3686274588108063], + [0.20784315466880798, 0.1882353127002716, 0.15294118225574493], + [0.364705890417099, 0.364705890417099, 0.3686274588108063], + [0.8078432083129883, 0.8078432083129883, 0.8078432083129883], + [0.4431372880935669, 0.4431372880935669, 0.4431372880935669], + [0.19607844948768616, 0.21176472306251526, 0.3607843220233917], + ] + ) + self.assertTrue(torch.allclose(encoding["pixel_values"][:, 1, 0, :3], expected_slices, atol=1e-5)) @slow def test_batched_coco_detection_annotations(self): @@ -277,89 +284,146 @@ def test_batched_coco_detection_annotations(self): images = [image_0, image_1] annotations = [annotations_0, annotations_1] - image_processing = RTDetrImageProcessor() - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - return_tensors="pt", # do_convert_annotations=True - ) + for image_processing_class in self.image_processor_list: + image_processing = image_processing_class() + encoding = image_processing( + images=images, + annotations=annotations, + 
return_segmentation_masks=True, + return_tensors="pt", # do_convert_annotations=True + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 640, 640 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.6879, 0.4609, 0.0755, 0.3691], + [0.2118, 0.3359, 0.2601, 0.1566], + [0.5011, 0.5000, 0.9979, 1.0000], + [0.5010, 0.5020, 0.9979, 0.9959], + [0.3284, 0.5944, 0.5884, 0.8112], + [0.8394, 0.5445, 0.3213, 0.9110], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.5503, 0.2765, 0.0604, 0.2215], + [0.1695, 0.2016, 0.2080, 0.0940], + [0.5006, 0.4933, 0.9977, 0.9865], + [0.5008, 0.5002, 0.9983, 0.9955], + [0.2627, 0.5456, 0.4707, 0.8646], + [0.7715, 0.4115, 0.4570, 0.7161], + ] + ) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + 
expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) - # Check the pixel values have been padded - postprocessed_height, postprocessed_width = 640, 640 - expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) - self.assertEqual(encoding["pixel_values"].shape, expected_shape) - - # Check the bounding boxes have been adjusted for padded images - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - expected_boxes_0 = torch.tensor( - [ - [0.6879, 0.4609, 0.0755, 0.3691], - [0.2118, 0.3359, 0.2601, 0.1566], - [0.5011, 0.5000, 0.9979, 1.0000], - [0.5010, 0.5020, 0.9979, 0.9959], - [0.3284, 0.5944, 0.5884, 0.8112], - [0.8394, 0.5445, 0.3213, 0.9110], - ] + @slow + @require_torch_gpu + # Copied from 
tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations + def test_fast_processor_equivalence_cpu_gpu_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + processor = self.image_processor_list[1]() + # 1. run processor on CPU + encoding_cpu = processor(images=image, annotations=target, return_tensors="pt", device="cpu") + # 2. run processor on GPU + encoding_gpu = processor(images=image, annotations=target, return_tensors="pt", device="cuda") + + # verify pixel values + self.assertEqual(encoding_cpu["pixel_values"].shape, encoding_gpu["pixel_values"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["pixel_values"][0, 0, 0, :3], + encoding_gpu["pixel_values"][0, 0, 0, :3].to("cpu"), + atol=1e-4, + ) ) - expected_boxes_1 = torch.tensor( - [ - [0.5503, 0.2765, 0.0604, 0.2215], - [0.1695, 0.2016, 0.2080, 0.0940], - [0.5006, 0.4933, 0.9977, 0.9865], - [0.5008, 0.5002, 0.9983, 0.9955], - [0.2627, 0.5456, 0.4707, 0.8646], - [0.7715, 0.4115, 0.4570, 0.7161], - ] + # verify area + self.assertTrue(torch.allclose(encoding_cpu["labels"][0]["area"], encoding_gpu["labels"][0]["area"].to("cpu"))) + # verify boxes + self.assertEqual(encoding_cpu["labels"][0]["boxes"].shape, encoding_gpu["labels"][0]["boxes"].shape) + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["boxes"][0], encoding_gpu["labels"][0]["boxes"][0].to("cpu"), atol=1e-3 + ) ) - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) - - # Check if do_convert_annotations=False, then the annotations are not converted to 
centre_x, centre_y, width, height - # format and not in the range [0, 1] - encoding = image_processing( - images=images, - annotations=annotations, - return_segmentation_masks=True, - do_convert_annotations=False, - return_tensors="pt", + # verify image_id + self.assertTrue( + torch.allclose(encoding_cpu["labels"][0]["image_id"], encoding_gpu["labels"][0]["image_id"].to("cpu")) ) - self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) - self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) - # Convert to absolute coordinates - unnormalized_boxes_0 = torch.vstack( - [ - expected_boxes_0[:, 0] * postprocessed_width, - expected_boxes_0[:, 1] * postprocessed_height, - expected_boxes_0[:, 2] * postprocessed_width, - expected_boxes_0[:, 3] * postprocessed_height, - ] - ).T - unnormalized_boxes_1 = torch.vstack( - [ - expected_boxes_1[:, 0] * postprocessed_width, - expected_boxes_1[:, 1] * postprocessed_height, - expected_boxes_1[:, 2] * postprocessed_width, - expected_boxes_1[:, 3] * postprocessed_height, - ] - ).T - # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max - expected_boxes_0 = torch.vstack( - [ - unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, - unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, - unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, - ] - ).T - expected_boxes_1 = torch.vstack( - [ - unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, - unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, - unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, - ] - ).T - self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) - self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + # verify is_crowd + self.assertTrue( + 
torch.allclose(encoding_cpu["labels"][0]["iscrowd"], encoding_gpu["labels"][0]["iscrowd"].to("cpu")) + ) + # verify class_labels + self.assertTrue( + torch.allclose( + encoding_cpu["labels"][0]["class_labels"], encoding_gpu["labels"][0]["class_labels"].to("cpu") + ) + ) + # verify orig_size + self.assertTrue( + torch.allclose(encoding_cpu["labels"][0]["orig_size"], encoding_gpu["labels"][0]["orig_size"].to("cpu")) + ) + # verify size + self.assertTrue(torch.allclose(encoding_cpu["labels"][0]["size"], encoding_gpu["labels"][0]["size"].to("cpu"))) From 405b56269812056d9593869e22b7b264d806cb1e Mon Sep 17 00:00:00 2001 From: anshumangahlot Date: Thu, 31 Oct 2024 01:07:39 +0530 Subject: [PATCH 77/99] UPDATE Documentation for #TRANSLATING.md Documentation into Multiple Languages.(Changes made) (#34226) * Update TRANSLATING.md * Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update TRANSLATING.md --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/TRANSLATING.md | 81 ++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/docs/TRANSLATING.md b/docs/TRANSLATING.md index 49747821f47..64dced45098 100644 --- a/docs/TRANSLATING.md +++ b/docs/TRANSLATING.md @@ -1,57 +1,70 @@ -### Translating the Transformers documentation into your language +# Translating the Transformers documentation into your language -As part of our mission to democratize machine learning, we'd love to make the Transformers library available in many more languages! Follow the steps below if you want to help translate the documentation into your language 🙏. +As part of our mission to democratize machine learning, we aim to make the Transformers library available in many more languages! Follow the steps below to help translate the documentation into your language. 
-**🗞️ Open an issue** +## Open an Issue -To get started, navigate to the [Issues](https://github.com/huggingface/transformers/issues) page of this repo and check if anyone else has opened an issue for your language. If not, open a new issue by selecting the "Translation template" from the "New issue" button. +1. Navigate to the Issues page of this repository. +2. Check if anyone has already opened an issue for your language. +3. If not, create a new issue by selecting the "Translation template" from the "New issue" button. +4. Post a comment indicating which chapters you’d like to work on, and we’ll add your name to the list. -Once an issue exists, post a comment to indicate which chapters you'd like to work on, and we'll add your name to the list. +## Fork the Repository +1. First, fork the Transformers repo by clicking the Fork button in the top-right corner. +2. Clone your fork to your local machine for editing with the following command: -**🍴 Fork the repository** + ```bash + git clone https://github.com/YOUR-USERNAME/transformers.git + ``` + + Replace `YOUR-USERNAME` with your GitHub username. -First, you'll need to [fork the Transformers repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo). You can do this by clicking on the **Fork** button on the top-right corner of this repo's page. +## Copy-paste the English version with a new language code -Once you've forked the repo, you'll want to get the files on your local machine for editing. You can do that by cloning the fork with Git as follows: +The documentation files are organized in the following directory: -```bash -git clone https://github.com/YOUR-USERNAME/transformers.git -``` +- **docs/source**: This contains all documentation materials organized by language. -**📋 Copy-paste the English version with a new language code** +To copy the English version to your new language directory: -The documentation files are in one leading directory: +1. 
Navigate to your fork of the repository: -- [`docs/source`](https://github.com/huggingface/transformers/tree/main/docs/source): All the documentation materials are organized here by language. + ```bash + cd ~/path/to/transformers/docs + ``` -You'll only need to copy the files in the [`docs/source/en`](https://github.com/huggingface/transformers/tree/main/docs/source/en) directory, so first navigate to your fork of the repo and run the following: + Replace `~/path/to` with your actual path. -```bash -cd ~/path/to/transformers/docs -cp -r source/en source/LANG-ID -``` +2. Run the following command: -Here, `LANG-ID` should be one of the ISO 639-1 or ISO 639-2 language codes -- see [here](https://www.loc.gov/standards/iso639-2/php/code_list.php) for a handy table. + ```bash + cp -r source/en source/LANG-ID + ``` -**✍️ Start translating** + Replace `LANG-ID` with the appropriate ISO 639-1 or ISO 639-2 language code (see [this table](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) for reference). -The fun part comes - translating the text! +## Start translating -The first thing we recommend is translating the part of the `_toctree.yml` file that corresponds to your doc chapter. This file is used to render the table of contents on the website. +Begin translating the text! -> 🙋 If the `_toctree.yml` file doesn't yet exist for your language, you can create one by copy-pasting from the English version and deleting the sections unrelated to your chapter. Just make sure it exists in the `docs/source/LANG-ID/` directory! +1. Start with the `_toctree.yml` file that corresponds to your documentation chapter. This file is essential for rendering the table of contents on the website. -The fields you should add are `local` (with the name of the file containing the translation; e.g. `autoclass_tutorial`), and `title` (with the title of the doc in your language; e.g. 
`Load pretrained instances with an AutoClass`) -- as a reference, here is the `_toctree.yml` for [English](https://github.com/huggingface/transformers/blob/main/docs/source/en/_toctree.yml): + - If the `_toctree.yml` file doesn’t exist for your language, create one by copying the English version and removing unrelated sections. + - Ensure it is placed in the `docs/source/LANG-ID/` directory. -```yaml -- sections: - - local: pipeline_tutorial # Do not change this! Use the same name for your .md file - title: Pipelines for inference # Translate this! - ... - title: Tutorials # Translate this! -``` + Here’s an example structure for the `_toctree.yml` file: -Once you have translated the `_toctree.yml` file, you can start translating the [MDX](https://mdxjs.com/) files associated with your docs chapter. + ```yaml + - sections: + - local: pipeline_tutorial # Keep this name for your .md file + title: Pipelines for Inference # Translate this + ... + title: Tutorials # Translate this + ``` -> 🙋 If you'd like others to help you with the translation, you should [open an issue](https://github.com/huggingface/transformers/issues) and tag @stevhliu. +2. Once you’ve translated the `_toctree.yml`, move on to translating the associated MDX files. + +## Collaborate and share + +If you'd like assistance with your translation, open an issue and tag `@stevhliu`. Feel free to share resources or glossaries to ensure consistent terminology. 
From f38531619ddff23a510d5f7ccbc257a1bb1a3cb7 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 31 Oct 2024 20:55:53 +0800 Subject: [PATCH 78/99] enable QA bf16 pipeline (#34483) * enable QA bf16 pipeline * add tests --- .../pipelines/question_answering.py | 10 ++++-- .../test_pipelines_question_answering.py | 33 +++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py index 6039e5ad1ee..7b876eefc49 100644 --- a/src/transformers/pipelines/question_answering.py +++ b/src/transformers/pipelines/question_answering.py @@ -540,8 +540,14 @@ def postprocess( min_null_score = 1000000 # large and positive answers = [] for output in model_outputs: - start_ = output["start"] - end_ = output["end"] + if self.framework == "pt" and output["start"].dtype == torch.bfloat16: + start_ = output["start"].to(torch.float32) + else: + start_ = output["start"] + if self.framework == "pt" and output["start"].dtype == torch.bfloat16: + end_ = output["end"].to(torch.float32) + else: + end_ = output["end"] example = output["example"] p_mask = output["p_mask"] attention_mask = ( diff --git a/tests/pipelines/test_pipelines_question_answering.py b/tests/pipelines/test_pipelines_question_answering.py index d06f88d1f08..bf4fc7db1db 100644 --- a/tests/pipelines/test_pipelines_question_answering.py +++ b/tests/pipelines/test_pipelines_question_answering.py @@ -27,6 +27,7 @@ from transformers.testing_utils import ( compare_pipeline_output_to_hub_spec, is_pipeline_test, + is_torch_available, nested_simplify, require_tf, require_torch, @@ -34,6 +35,10 @@ slow, ) + +if is_torch_available(): + import torch + from .test_pipelines_common import ANY @@ -165,6 +170,34 @@ def test_small_model_pt(self): self.assertEqual(nested_simplify(outputs), {"score": 0.01, "start": 0, "end": 11, "answer": "HuggingFace"}) + @require_torch + def test_small_model_pt_fp16(self): + question_answerer 
= pipeline( + "question-answering", + model="sshleifer/tiny-distilbert-base-cased-distilled-squad", + torch_dtype=torch.float16, + ) + + outputs = question_answerer( + question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." + ) + + self.assertEqual(nested_simplify(outputs), {"score": 0.01, "start": 0, "end": 11, "answer": "HuggingFace"}) + + @require_torch + def test_small_model_pt_bf16(self): + question_answerer = pipeline( + "question-answering", + model="sshleifer/tiny-distilbert-base-cased-distilled-squad", + torch_dtype=torch.bfloat16, + ) + + outputs = question_answerer( + question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." + ) + + self.assertEqual(nested_simplify(outputs), {"score": 0.01, "start": 0, "end": 11, "answer": "HuggingFace"}) + @require_torch def test_small_model_pt_iterator(self): # https://github.com/huggingface/transformers/issues/18510 From 1b86772de5d8429ff5ffe5caf7855548d2f4f072 Mon Sep 17 00:00:00 2001 From: jp Date: Thu, 31 Oct 2024 22:32:45 +0900 Subject: [PATCH 79/99] Fix: img size mismatch caused by incorrect unpadding in LLaVA-Next (#34522) Fix: unpadding img mismatch --- src/transformers/models/llava_next/modeling_llava_next.py | 4 ++-- .../models/llava_next_video/modeling_llava_next_video.py | 4 ++-- .../models/llava_onevision/modeling_llava_onevision.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 5a49337b2b5..2d23c48225c 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -138,12 +138,12 @@ def unpad_image(tensor, original_size): if original_aspect_ratio > current_aspect_ratio: scale_factor = current_width / original_width - new_height = int(original_height * scale_factor) + new_height = int(round(original_height * scale_factor, 7)) 
padding = (current_height - new_height) // 2 unpadded_tensor = tensor[:, padding : current_height - padding, :] else: scale_factor = current_height / original_height - new_width = int(original_width * scale_factor) + new_width = int(round(original_width * scale_factor, 7)) padding = (current_width - new_width) // 2 unpadded_tensor = tensor[:, :, padding : current_width - padding] diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 85c109919da..a2328c1d2d9 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -143,12 +143,12 @@ def unpad_image(tensor, original_size): if original_aspect_ratio > current_aspect_ratio: scale_factor = current_width / original_width - new_height = int(original_height * scale_factor) + new_height = int(round(original_height * scale_factor, 7)) padding = (current_height - new_height) // 2 unpadded_tensor = tensor[:, padding : current_height - padding, :] else: scale_factor = current_height / original_height - new_width = int(original_width * scale_factor) + new_width = int(round(original_width * scale_factor, 7)) padding = (current_width - new_width) // 2 unpadded_tensor = tensor[:, :, padding : current_width - padding] diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 2aa6b2fa1d6..626db4d96aa 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -139,12 +139,12 @@ def unpad_image(tensor, original_size): if original_aspect_ratio > current_aspect_ratio: scale_factor = current_width / original_width - new_height = int(original_height * scale_factor) + new_height = int(round(original_height * scale_factor, 7)) padding = 
(current_height - new_height) // 2 unpadded_tensor = tensor[:, padding : current_height - padding, :] else: scale_factor = current_height / original_height - new_width = int(original_width * scale_factor) + new_width = int(round(original_width * scale_factor, 7)) padding = (current_width - new_width) // 2 unpadded_tensor = tensor[:, :, padding : current_width - padding] From dca93ca076c68372dcf3ad1239a2119afdda629c Mon Sep 17 00:00:00 2001 From: kibitzing Date: Thu, 31 Oct 2024 22:53:23 +0900 Subject: [PATCH 80/99] Fix step shifting when accumulate gradient (#33673) * replace total_batched_samples with step while counting grad accum step * remove unused variable * simplify condition for update step * fix format by ruff * simplify update step condition using accelerator.sync_gradients * simplify update condition using do_sync_step * remove print for test --------- Co-authored-by: Zach Mueller --- src/transformers/trainer.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index e2ae622e2b6..30caa2de260 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2404,7 +2404,6 @@ def _inner_training_loop( if args.eval_on_start: self._evaluate(trial, ignore_keys_for_eval, skip_scheduler=True) - total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): epoch_dataloader = train_dataloader if hasattr(epoch_dataloader, "set_epoch"): @@ -2447,13 +2446,7 @@ def _inner_training_loop( batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches) for inputs in batch_samples: step += 1 - total_batched_samples += 1 - is_last_step_and_steps_less_than_grad_acc = ( - steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch - ) - do_sync_step = is_last_step_and_steps_less_than_grad_acc or ( - total_batched_samples % args.gradient_accumulation_steps == 0 - ) + do_sync_step = (step + 1) % 
args.gradient_accumulation_steps == 0 or (step + 1) == steps_in_epoch # Since we perform prefetching, we need to manually set sync_gradients if not do_sync_step: self.accelerator.gradient_state._set_sync_gradients(False) From ab98f0b0a1cd90b1c72948daf83c098037212fc4 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 31 Oct 2024 16:36:13 +0100 Subject: [PATCH 81/99] avoid calling `gc.collect` and `cuda.empty_cache` (#34514) * update * update * update * update * update --------- Co-authored-by: ydshieh --- src/transformers/testing_utils.py | 8 ++++++++ tests/models/clvp/test_feature_extraction_clvp.py | 12 ++++++++---- tests/models/clvp/test_modeling_clvp.py | 14 +++++--------- tests/models/ctrl/test_modeling_ctrl.py | 9 +++------ tests/models/gpt2/test_modeling_gpt2.py | 9 +++------ .../gpt_bigcode/test_modeling_gpt_bigcode.py | 8 ++++---- tests/models/idefics2/test_modeling_idefics2.py | 5 ++--- tests/models/idefics3/test_modeling_idefics3.py | 6 ++---- tests/models/llama/test_modeling_llama.py | 6 ++---- tests/models/llava/test_modeling_llava.py | 5 ++--- .../models/llava_next/test_modeling_llava_next.py | 5 ++--- .../test_modeling_llava_next_video.py | 5 ++--- .../test_modeling_llava_onevision.py | 5 ++--- tests/models/mistral/test_modeling_mistral.py | 7 +++---- tests/models/mllama/test_modeling_mllama.py | 5 ++--- tests/models/paligemma/test_modeling_paligemma.py | 5 ++--- .../qwen2_audio/test_modeling_qwen2_audio.py | 5 ++--- tests/models/rag/test_modeling_rag.py | 11 ++++------- tests/models/sam/test_modeling_sam.py | 6 ++---- tests/models/univnet/test_modeling_univnet.py | 6 ++---- .../video_llava/test_modeling_video_llava.py | 5 ++--- tests/models/vipllava/test_modeling_vipllava.py | 13 +++++++++---- tests/models/wav2vec2/test_modeling_wav2vec2.py | 6 ++---- tests/models/xglm/test_modeling_xglm.py | 5 ++--- 24 files changed, 77 insertions(+), 94 deletions(-) diff --git a/src/transformers/testing_utils.py 
b/src/transformers/testing_utils.py index 0eef286732d..8d6c1b19377 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -16,6 +16,7 @@ import contextlib import doctest import functools +import gc import importlib import inspect import logging @@ -2679,3 +2680,10 @@ def compare_pipeline_output_to_hub_spec(output, hub_spec): if unexpected_keys: error.append(f"Keys in pipeline output that are not in Hub spec: {unexpected_keys}") raise KeyError("\n".join(error)) + + +@require_torch +def cleanup(device: str, gc_collect=False): + if gc_collect: + gc.collect() + backend_empty_cache(device) diff --git a/tests/models/clvp/test_feature_extraction_clvp.py b/tests/models/clvp/test_feature_extraction_clvp.py index db641eaf614..1f059ca4694 100644 --- a/tests/models/clvp/test_feature_extraction_clvp.py +++ b/tests/models/clvp/test_feature_extraction_clvp.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import gc import itertools import os import random @@ -24,7 +23,13 @@ from datasets import Audio, load_dataset from transformers import ClvpFeatureExtractor -from transformers.testing_utils import check_json_file_has_correct_format, require_torch, slow +from transformers.testing_utils import ( + check_json_file_has_correct_format, + cleanup, + require_torch, + slow, + torch_device, +) from transformers.utils.import_utils import is_torch_available from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -116,8 +121,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device) # Copied from transformers.tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_from_and_save_pretrained def test_feat_extract_from_and_save_pretrained(self): diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index 0cf89a74523..12e58500063 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Testing suite for the PyTorch Clvp model.""" -import gc import tempfile import unittest @@ -23,6 +22,7 @@ from transformers import ClvpConfig, ClvpDecoderConfig, ClvpEncoderConfig from transformers.testing_utils import ( + cleanup, require_torch, slow, torch_device, @@ -174,8 +174,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device) def test_config(self): self.encoder_config_tester.run_common_tests() @@ -294,8 +293,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() @@ -421,8 +419,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() @@ -571,8 +568,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device, gc_collect=True) def test_conditional_encoder(self): with torch.no_grad(): diff --git a/tests/models/ctrl/test_modeling_ctrl.py b/tests/models/ctrl/test_modeling_ctrl.py index a9bdddd7bfe..88efa9bb189 100644 --- a/tests/models/ctrl/test_modeling_ctrl.py +++ b/tests/models/ctrl/test_modeling_ctrl.py @@ -13,11 +13,10 @@ # limitations under the License. 
-import gc import unittest from transformers import CTRLConfig, is_torch_available -from transformers.testing_utils import backend_empty_cache, require_torch, slow, torch_device +from transformers.testing_utils import cleanup, require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -235,8 +234,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - backend_empty_cache(torch_device) + cleanup(torch_device) def test_config(self): self.config_tester.run_common_tests() @@ -261,8 +259,7 @@ class CTRLModelLanguageGenerationTest(unittest.TestCase): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - backend_empty_cache(torch_device) + cleanup(torch_device, gc_collect=True) @slow def test_lm_generate_ctrl(self): diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py index 3f96c20ab2d..012444b472c 100644 --- a/tests/models/gpt2/test_modeling_gpt2.py +++ b/tests/models/gpt2/test_modeling_gpt2.py @@ -15,7 +15,6 @@ import datetime -import gc import math import unittest @@ -23,7 +22,7 @@ from transformers import GPT2Config, is_torch_available from transformers.testing_utils import ( - backend_empty_cache, + cleanup, require_flash_attn, require_torch, require_torch_gpu, @@ -542,8 +541,7 @@ def setUp(self): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - backend_empty_cache(torch_device) + cleanup(torch_device) def test_config(self): self.config_tester.run_common_tests() @@ -753,8 +751,7 @@ class GPT2ModelLanguageGenerationTest(unittest.TestCase): def tearDown(self): super().tearDown() # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - backend_empty_cache(torch_device) + cleanup(torch_device, 
gc_collect=True) def _test_lm_generate_gpt2_helper( self, diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py index 9d7750f5cf2..1db484c4062 100644 --- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -18,7 +18,7 @@ from parameterized import parameterized from transformers import GPTBigCodeConfig, is_torch_available -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import cleanup, require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -422,9 +422,9 @@ def setUp(self): self.config_tester = ConfigTester(self, config_class=GPTBigCodeConfig, n_embd=37) def tearDown(self): - import gc - - gc.collect() + super().tearDown() + # clean-up as much as possible GPU memory occupied by PyTorch + cleanup(torch_device) def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 042fecf4bd2..0b0f3c1f3d8 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Idefics2 model.""" import copy -import gc import tempfile import unittest from io import BytesIO @@ -31,6 +30,7 @@ is_vision_available, ) from transformers.testing_utils import ( + cleanup, require_bitsandbytes, require_flash_attn, require_torch, @@ -583,8 +583,7 @@ def setUp(self): ) def tearDown(self): - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device, gc_collect=True) @slow @require_torch_multi_gpu diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 5dc352d22fe..dc5aad2fd04 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py 
+++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Idefics3 model.""" import copy -import gc import unittest from io import BytesIO @@ -26,7 +25,7 @@ is_torch_available, is_vision_available, ) -from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device +from transformers.testing_utils import cleanup, require_bitsandbytes, require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -497,8 +496,7 @@ def setUp(self): ) def tearDown(self): - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device, gc_collect=True) @slow @unittest.skip("multi-gpu tests are disabled for now") diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 375ec1dd3e6..9e67f4f7381 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Testing suite for the PyTorch LLaMA model.""" -import gc import tempfile import unittest @@ -25,7 +24,7 @@ from transformers import AutoTokenizer, LlamaConfig, StaticCache, is_torch_available, set_seed from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( - backend_empty_cache, + cleanup, require_flash_attn, require_read_token, require_torch, @@ -891,8 +890,7 @@ def test_export_static_cache(self): @require_torch_accelerator class Mask4DTestHard(unittest.TestCase): def tearDown(self): - gc.collect() - backend_empty_cache(torch_device) + cleanup(torch_device, gc_collect=True) def setUp(self): model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 1a17f18de34..af0eddcd35b 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -14,7 +14,6 @@ # limitations under the License. """Testing suite for the PyTorch Llava model.""" -import gc import unittest import requests @@ -28,6 +27,7 @@ is_vision_available, ) from transformers.testing_utils import ( + cleanup, require_bitsandbytes, require_torch, require_torch_gpu, @@ -307,8 +307,7 @@ def setUp(self): self.processor = AutoProcessor.from_pretrained("llava-hf/bakLlava-v1-hf") def tearDown(self): - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device, gc_collect=True) @slow @require_bitsandbytes diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index e088b250536..e960f9f6759 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Testing suite for the PyTorch Llava-NeXT model.""" -import gc import unittest import requests @@ -28,6 +27,7 @@ is_vision_available, ) from transformers.testing_utils import ( + cleanup, require_bitsandbytes, require_torch, slow, @@ -370,8 +370,7 @@ def setUp(self): self.prompt = "[INST] \nWhat is shown in this image? [/INST]" def tearDown(self): - gc.collect() - torch.cuda.empty_cache() + cleanup(torch_device, gc_collect=True) @slow @require_bitsandbytes diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index edf1dd2d4c0..89cdce65ece 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -14,7 +14,6 @@ # limitations under the License. """Testing suite for the PyTorch Llava-NeXT-Video model.""" -import gc import unittest import numpy as np @@ -29,6 +28,7 @@ is_vision_available, ) from transformers.testing_utils import ( + cleanup, require_bitsandbytes, require_torch, slow, @@ -400,8 +400,7 @@ def setUp(self): self.prompt_video = "USER: