From ef976a7e181b78abf2f1ba7ea02e506ea1cb111e Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 4 Nov 2024 07:47:34 -0500 Subject: [PATCH 001/318] Update trainer for easier handling of accumulate, compile fixes, and proper reporting (#34511) * Update trainer for easier handling of accumulate + proper reporting * test * Fixup tests * Full fix * Fix style * rm comment * Fix tests * Minimize test + remove py 311 check * Unused import * Forward contrib credits from discussions * Fix reported metrics * Refactor, good as it's going to get * rm pad tok id check * object detection and audio are being annoying * Fin * Fin x2 --------- Co-authored-by: Gyanateet Dutta --- src/transformers/modeling_utils.py | 3 +- src/transformers/trainer.py | 73 ++++++++++++++++-------------- tests/trainer/test_trainer.py | 43 +++++++++++++----- 3 files changed, 71 insertions(+), 48 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 8481fa7df9cd96..2ef4c3615c9fa2 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -28,7 +28,7 @@ import warnings from contextlib import contextmanager from dataclasses import dataclass -from functools import lru_cache, partial, wraps +from functools import partial, wraps from threading import Thread from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union from zipfile import is_zipfile @@ -5014,7 +5014,6 @@ def _is_quantized_training_enabled(self): return self.hf_quantizer.is_trainable @property - @lru_cache def loss_function(self): if getattr(self.config, "loss_type", None) is not None: loss_type = self.config.loss_type diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 30caa2de260cb7..d41b7181be6334 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -233,7 +233,6 @@ from accelerate.utils import ( DistributedDataParallelKwargs, DistributedType, - GradientAccumulationPlugin, load_fsdp_model, 
load_fsdp_optimizer, save_fsdp_model, @@ -601,8 +600,10 @@ def __init__( if not _is_peft_model(unwrapped_model) else unwrapped_model.get_base_model().forward ) - - self.model_accepts_loss_kwargs = "loss_kwargs" in inspect.signature(model_forward).parameters + forward_params = inspect.signature(model_forward).parameters + self.model_accepts_loss_kwargs = ( + "loss_kwargs" in forward_params and forward_params["loss_kwargs"].kind == inspect.Parameter.VAR_KEYWORD + ) self.neftune_noise_alpha = args.neftune_noise_alpha @@ -2444,7 +2445,7 @@ def _inner_training_loop( update_step += 1 num_batches = args.gradient_accumulation_steps if update_step != (total_updates - 1) else remainder batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches) - for inputs in batch_samples: + for i, inputs in enumerate(batch_samples): step += 1 do_sync_step = (step + 1) % args.gradient_accumulation_steps == 0 or (step + 1) == steps_in_epoch # Since we perform prefetching, we need to manually set sync_gradients @@ -2484,7 +2485,13 @@ def _inner_training_loop( if step % args.gradient_accumulation_steps == 0: self.control = self.callback_handler.on_step_begin(args, self.state, self.control) - with self.accelerator.accumulate(model): + # We explicitly want to avoid relying on `accelerator.accumulate` for generation training + context = ( + functools.partial(self.accelerator.no_sync, model=model) + if i == len(batch_samples) - 1 + else contextlib.nullcontext + ) + with context(): tr_loss_step = self.training_step(model, inputs, num_items_in_batch) if ( @@ -3636,15 +3643,11 @@ def training_step( with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: - if num_items_in_batch is not None: - if self.compute_loss_func or self.model_accepts_loss_kwargs: - loss *= self.args.gradient_accumulation_steps - # Average tokens across devices is orthogonal to gradient accumulation - if self.args.average_tokens_across_devices: - loss *= 
self.args.world_size self.accelerator.backward(loss, **kwargs) - - return loss.detach() / self.args.gradient_accumulation_steps + # Finally we need to normalize the loss for reporting + if num_items_in_batch is None: + return loss.detach() / self.args.gradient_accumulation_steps + return loss.detach() def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): """ @@ -3656,9 +3659,6 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N labels = inputs.pop("labels") else: labels = None - if self.args.average_tokens_across_devices and num_items_in_batch is not None: - num_items_in_batch_tensor = torch.tensor(num_items_in_batch, device=self.args.device) - num_items_in_batch = int(self.accelerator.gather(num_items_in_batch_tensor).sum().cpu()) if self.model_accepts_loss_kwargs: loss_kwargs = {} if num_items_in_batch is not None: @@ -3692,6 +3692,9 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N # We don't use .loss here since the model may return tuples instead of ModelOutput. 
loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + if self.args.average_tokens_across_devices and self.model_accepts_loss_kwargs: + loss *= self.accelerator.num_processes + return (loss, outputs) if return_outputs else loss def is_local_process_zero(self) -> bool: @@ -4946,24 +4949,21 @@ def _add_sm_patterns_to_gitignore(self) -> None: self.repo.git_push() def create_accelerator_and_postprocess(self): + # We explicitly don't rely on the `Accelerator` to do gradient accumulation grad_acc_kwargs = {} if is_accelerate_available("0.28.0") and self.args.accelerator_config.gradient_accumulation_kwargs is not None: grad_acc_kwargs = self.args.accelerator_config.gradient_accumulation_kwargs # check if num_steps is attempted to be passed in gradient_accumulation_kwargs - if "num_steps" in grad_acc_kwargs and self.args.gradient_accumulation_steps > 1: - # raise because we do not know which setting is intended. - raise ValueError( - "The `AcceleratorConfig`'s `num_steps` is set but `gradient_accumulation_steps` is greater than 1 in the passed `TrainingArguments`" - "If using the passed `AcceleratorConfig` is desired, do not set the `TrainingArguments` `gradient_accumulation_steps`." - ) - elif "num_steps" not in grad_acc_kwargs: - # take the gradient_accumulation_steps setting from TrainingArguments. - grad_acc_kwargs["num_steps"] = self.args.gradient_accumulation_steps - - grad_acc_kwargs["sync_with_dataloader"] = False - - gradient_accumulation_plugin = GradientAccumulationPlugin(**grad_acc_kwargs) + if "num_steps" in grad_acc_kwargs: + if self.args.gradient_accumulation_steps > 1: + # raise because we do not know which setting is intended. + raise ValueError( + "The `AcceleratorConfig`'s `num_steps` is set but `gradient_accumulation_steps` is greater than 1 in the passed `TrainingArguments`" + "If using the passed `AcceleratorConfig` is desired, do not set the `TrainingArguments` `gradient_accumulation_steps`." 
+ ) + else: + self.args.gradient_accumulation_steps = grad_acc_kwargs["num_steps"] accelerator_config = self.args.accelerator_config.to_dict() @@ -4994,7 +4994,6 @@ def create_accelerator_and_postprocess(self): args = { "deepspeed_plugin": self.args.deepspeed_plugin, - "gradient_accumulation_plugin": gradient_accumulation_plugin, } if is_accelerate_available("0.28.0"): args["dataloader_config"] = dataloader_config @@ -5090,12 +5089,18 @@ def get_batch_samples(self, epoch_iterator, num_batches): batch_samples += [next(epoch_iterator)] except StopIteration: break + + # Keep default behavior the same + if not self.model_accepts_loss_kwargs: + return batch_samples, None + if len(batch_samples) > 0 and "labels" in batch_samples[0]: # For now we don't support object detection try: - num_items_in_batch = sum( - [data_batch["labels"][..., 1:].ne(-100).sum().item() for data_batch in batch_samples] - ) - except TypeError: + num_items_in_batch = sum([(batch["labels"].ne(-100)).sum() for batch in batch_samples]) + except (TypeError, AttributeError): pass + + if self.args.average_tokens_across_devices: + num_items_in_batch = self.accelerator.gather(num_items_in_batch).sum().item() return batch_samples, num_items_in_batch diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index b6fe807fa4961a..5658372fa71308 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -272,6 +272,19 @@ def __getitem__(self, i): return {"input_ids": self.x, "labels": self.x} +class SequenceClassificationDataset: + def __init__(self, length=64, vocab_size=100, num_labels=5): + self.length = length + self.sequences = [torch.randint(0, vocab_size, (64,)).tolist() for _ in range(length)] + self.labels = torch.randint(0, num_labels, (length,)).tolist() + + def __len__(self): + return self.length + + def __getitem__(self, i): + return {"input_ids": self.sequences[i], "label": self.labels[i]} + + class DynamicShapesDataset: def __init__(self, length=64, 
seed=42, batch_size=8): self.length = length @@ -1144,6 +1157,23 @@ def test_number_of_steps_in_training_with_ipex(self): train_output = trainer.train() self.assertEqual(train_output.global_step, 10) + def test_torch_compile_loss_func_compatibility(self): + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_llama = LlamaForCausalLM(config) + + x = torch.randint(0, 100, (128,)) + train_dataset = RepeatDataset(x) + + with tempfile.TemporaryDirectory() as tmp_dir: + args = TrainingArguments( + tmp_dir, + per_device_train_batch_size=2, + torch_compile=True, + max_steps=1, # compile happens on the first step + ) + trainer = Trainer(model=tiny_llama, args=args, train_dataset=train_dataset) # noqa + trainer.train() + @require_peft @require_bitsandbytes def test_bnb_compile(self): @@ -3676,9 +3706,6 @@ def test_accelerator_config_from_dict(self): self.assertEqual(trainer.accelerator.even_batches, False) self.assertEqual(trainer.accelerator.use_seedable_sampler, True) - if GRAD_ACCUM_KWARGS_VERSION_AVAILABLE: - self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["sync_each_batch"], True) - def test_accelerator_config_from_yaml(self): # Checks that accelerator kwargs can be passed through # and the accelerator is initialized respectively @@ -3691,8 +3718,6 @@ def test_accelerator_config_from_yaml(self): "even_batches": False, "use_seedable_sampler": False, } - if GRAD_ACCUM_KWARGS_VERSION_AVAILABLE: - accelerator_config["gradient_accumulation_kwargs"] = {"sync_each_batch": True} json.dump(accelerator_config, f) config = RegressionModelConfig(a=1.5, b=2.5) model = RegressionPreTrainedModel(config) @@ -3706,9 +3731,6 @@ def test_accelerator_config_from_yaml(self): self.assertEqual(trainer.accelerator.even_batches, False) self.assertEqual(trainer.accelerator.use_seedable_sampler, False) - if GRAD_ACCUM_KWARGS_VERSION_AVAILABLE: - 
self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["sync_each_batch"], True) - def test_accelerator_config_from_dataclass(self): # Checks that accelerator kwargs can be passed through # and the accelerator is initialized respectively @@ -3754,10 +3776,7 @@ def test_accelerate_config_from_dataclass_grad_accum(self): with tempfile.TemporaryDirectory() as tmp_dir: args = RegressionTrainingArguments(output_dir=tmp_dir, accelerator_config=accelerator_config) trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset) - self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["num_steps"], 10) - self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["adjust_scheduler"], False) - self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["sync_with_dataloader"], False) - self.assertEqual(trainer.accelerator.gradient_state.plugin_kwargs["sync_each_batch"], True) + self.assertEqual(trainer.args.gradient_accumulation_steps, 10) def test_accelerator_config_from_partial(self): # Checks that accelerator kwargs can be passed through From 187439c3fa139b2102a874483e9f8f0cfa8e5557 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Mon, 4 Nov 2024 16:37:51 +0100 Subject: [PATCH 002/318] VLM: special multimodal Tokenizer (#34461) * kinda works * update * add tests * update * use special tokens in processors * typo * fix copies * fix * fix moshi after rebase * update * fix tests * update * Update docs/source/en/main_classes/tokenizer.md Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * update docs * test for load time adding tokens * fix some more tests which are now fetched better * one more fix --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- docs/source/en/main_classes/tokenizer.md | 19 + src/transformers/data/data_collator.py | 22 +- .../models/blip_2/processing_blip_2.py | 7 +- .../models/chameleon/processing_chameleon.py | 9 +- 
.../models/gemma/tokenization_gemma.py | 2 +- .../models/idefics/processing_idefics.py | 6 +- .../models/idefics2/processing_idefics2.py | 17 +- .../instructblip/processing_instructblip.py | 7 +- .../processing_instructblipvideo.py | 7 +- .../layoutxlm/tokenization_layoutxlm.py | 2 +- .../models/llama/tokenization_llama.py | 2 +- .../models/llava/processing_llava.py | 2 +- .../llava_next/processing_llava_next.py | 2 +- .../processing_llava_next_video.py | 4 +- .../processing_llava_onevision.py | 4 +- .../models/mllama/processing_mllama.py | 9 +- .../models/paligemma/processing_paligemma.py | 12 +- .../models/qwen2_vl/processing_qwen2_vl.py | 14 +- .../models/udop/tokenization_udop.py | 2 +- .../video_llava/processing_video_llava.py | 4 +- src/transformers/tokenization_utils.py | 2 +- src/transformers/tokenization_utils_base.py | 324 ++++-------------- src/transformers/tokenization_utils_fast.py | 5 +- .../camembert/test_tokenization_camembert.py | 4 +- .../test_tokenization_layoutlmv2.py | 4 +- .../test_tokenization_layoutlmv3.py | 4 +- .../layoutxlm/test_tokenization_layoutxlm.py | 4 +- tests/models/llama/test_tokenization_llama.py | 1 + .../markuplm/test_tokenization_markuplm.py | 4 +- tests/models/moshi/test_tokenization_moshi.py | 4 +- .../rembert/test_tokenization_rembert.py | 4 +- tests/models/udop/test_tokenization_udop.py | 4 +- tests/test_tokenization_common.py | 9 +- tests/tokenization/test_tokenization_utils.py | 49 +++ tests/trainer/test_data_collator.py | 6 +- 35 files changed, 247 insertions(+), 334 deletions(-) diff --git a/docs/source/en/main_classes/tokenizer.md b/docs/source/en/main_classes/tokenizer.md index 2ad7e450404e77..83d2ae5df6a7fb 100644 --- a/docs/source/en/main_classes/tokenizer.md +++ b/docs/source/en/main_classes/tokenizer.md @@ -51,6 +51,25 @@ token space (e.g., getting the index of the token comprising a given character o to a given token). 
+# Multimodal Tokenizer + +Apart from that each tokenizer can be a "multimodal" tokenizer which means that the tokenizer will hold all relevant special tokens +as part of tokenizer attributes for easier access. For example, if the tokenizer is loaded from a vision-language model like LLaVA, you will +be able to access `tokenizer.image_token_id` to obtain the special image token used as a placeholder. + +To enable extra special tokens for any type of tokenizer, you have to add the following lines and save the tokenizer. Extra special tokens do not +have to be modality related and can be anything that the model often needs access to. In the below code, tokenizer at `output_dir` will have direct access +to three more special tokens. + +```python +vision_tokenizer = AutoTokenizer.from_pretrained( + "llava-hf/llava-1.5-7b-hf", + extra_special_tokens={"image_token": "", "boi_token": "", "eoi_token": ""} +) +print(vision_tokenizer.image_token, vision_tokenizer.image_token_id) +("", 32000) +``` + ## PreTrainedTokenizer [[autodoc]] PreTrainedTokenizer diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index cc80f6a19bfb26..9e75e6fd3c38df 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -443,7 +443,7 @@ def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] return torch.stack(examples, dim=0) # If yes, check if we have a `pad_token`. - if tokenizer._pad_token is None: + if tokenizer.pad_token is None: raise ValueError( "You are attempting to pad samples but the tokenizer you are using" f" ({tokenizer.__class__.__name__}) does not have a pad token." @@ -477,7 +477,7 @@ def _tf_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = N return tf.stack(examples, axis=0) # If yes, check if we have a `pad_token`. 
- if tokenizer._pad_token is None: + if tokenizer.pad_token is None: raise ValueError( "You are attempting to pad samples but the tokenizer you are using" f" ({tokenizer.__class__.__name__}) does not have a pad token." @@ -513,7 +513,7 @@ def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] return np.stack(examples, axis=0) # If yes, check if we have a `pad_token`. - if tokenizer._pad_token is None: + if tokenizer.pad_token is None: raise ValueError( "You are attempting to pad samples but the tokenizer you are using" f" ({tokenizer.__class__.__name__}) does not have a pad token." @@ -1090,7 +1090,7 @@ def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) @@ -1131,7 +1131,7 @@ def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels ] masked_indices = masked_indices & ~tf.cast(special_tokens_mask, dtype=tf.bool) - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: padding_mask = inputs == self.tokenizer.pad_token_id masked_indices = masked_indices & ~padding_mask @@ -1170,7 +1170,7 @@ def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] masked_indices[np.array(special_tokens_mask, dtype=bool)] = 0 - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: padding_mask = labels == self.tokenizer.pad_token_id 
masked_indices[padding_mask] = 0 @@ -1251,13 +1251,13 @@ def mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any]: self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) probability_matrix.masked_fill_(padding_mask, value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() # probability be `1` (masked), however in albert model attention mask `0` means masked, revert the value attention_mask = (~masked_indices).float() - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: attention_padding_mask = labels.eq(self.tokenizer.pad_token_id) attention_mask.masked_fill_(attention_padding_mask, value=1.0) labels[~masked_indices] = -100 # We only compute loss on masked tokens, -100 is default for CE compute @@ -1367,7 +1367,7 @@ def torch_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: dtype=torch.bool, ) masked_indices.masked_fill_(special_tokens_mask, value=0.0) - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: padding_mask = labels.eq(self.tokenizer.pad_token_id) masked_indices.masked_fill_(padding_mask, value=0.0) @@ -1471,7 +1471,7 @@ def tf_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: ) special_tokens_mask = tf.cast(special_tokens_mask, dtype=tf.bool) masked_indices = masked_indices & ~special_tokens_mask - if self.tokenizer._pad_token is not None: + if self.tokenizer.pad_token is not None: padding_mask = labels == self.tokenizer.pad_token_id masked_indices = masked_indices & ~padding_mask @@ -1571,7 +1571,7 @@ def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: dtype=bool, ) masked_indices[special_tokens_mask] = 0 - if self.tokenizer._pad_token is not 
None: + if self.tokenizer.pad_token is not None: padding_mask = labels == self.tokenizer.pad_token_id masked_indices[padding_mask] = 0.0 diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index fa6a99f71a4616..c6852378412895 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -74,8 +74,11 @@ class Blip2Processor(ProcessorMixin): def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): tokenizer.return_token_type_ids = False self.current_processor = image_processor - self.image_token = AddedToken("", normalized=False, special=True) - tokenizer.add_tokens([self.image_token], special_tokens=True) + if not hasattr(tokenizer, "image_token"): + self.image_token = AddedToken("", normalized=False, special=True) + tokenizer.add_tokens([self.image_token], special_tokens=True) + else: + self.image_token = tokenizer.image_token self.num_query_tokens = num_query_tokens super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 2d699c8f663a61..e2a50d1af51b9e 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -66,9 +66,12 @@ class ChameleonProcessor(ProcessorMixin): def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): self.image_seq_length = image_seq_length - self.image_token = image_token - self.image_start_token = "" # fixed tokens for start and end, so can hardcode - self.image_end_token = "" + self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token + self.image_start_token = ( + tokenizer.boi_token if hasattr(tokenizer, "boi_token") else "" + ) # fixed tokens for start and end, so can hardcode + self.image_end_token = 
tokenizer.eoi_token if hasattr(tokenizer, "eoi_token") else "" + super().__init__(image_processor, tokenizer) def __call__( diff --git a/src/transformers/models/gemma/tokenization_gemma.py b/src/transformers/models/gemma/tokenization_gemma.py index ff0d1d034c2238..7138cafbd625fc 100644 --- a/src/transformers/models/gemma/tokenization_gemma.py +++ b/src/transformers/models/gemma/tokenization_gemma.py @@ -138,7 +138,7 @@ def __getstate__(self): return state def __setstate__(self, d): - self.__dict__ = d + self.__dict__.update(d) self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.LoadFromSerializedProto(self.sp_model_proto) diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 3406ab2226e08b..ca6e4702d3173e 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -219,7 +219,11 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u super().__init__(image_processor, tokenizer) self.current_processor = self.image_processor - self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) + self.image_token_id = ( + tokenizer.image_token_id + if hasattr(tokenizer, "image_token") + else tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) + ) self.default_image_dims = ( self.image_processor.image_num_channels, diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index 9a041257c36b5b..f99c1bda474568 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -95,16 +95,19 @@ def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 64, cha if tokenizer is None: raise ValueError("You need to specify a `tokenizer`.") - self.fake_image_token = AddedToken("", normalized=False, special=True) - self.image_token = 
AddedToken("", normalized=False, special=True) + if not hasattr(tokenizer, "image_token"): + self.fake_image_token = AddedToken("", normalized=False, special=True) + self.image_token = AddedToken("", normalized=False, special=True) + tokens_to_add = {"additional_special_tokens": [self.fake_image_token, self.image_token]} + tokenizer.add_special_tokens(tokens_to_add) + else: + self.fake_image_token = tokenizer.image_boundary_token + self.image_token = tokenizer.image_token + self.end_of_utterance_token = AddedToken("", normalized=False, special=True) + tokenizer.add_special_tokens({"additional_special_tokens": [self.end_of_utterance_token]}) self.image_seq_len = image_seq_len - tokens_to_add = { - "additional_special_tokens": [self.fake_image_token, self.image_token, self.end_of_utterance_token] - } - tokenizer.add_special_tokens(tokens_to_add) - super().__init__(image_processor, tokenizer, chat_template=chat_template) def _extract_images_from_prompts(self, prompts): diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index 05ff9871f4d731..3d48839d376c5c 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -78,8 +78,11 @@ class InstructBlipProcessor(ProcessorMixin): qformer_tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): - self.image_token = AddedToken("", normalized=False, special=True) - tokenizer.add_tokens([self.image_token], special_tokens=True) + if not hasattr(tokenizer, "image_token"): + self.image_token = AddedToken("", normalized=False, special=True) + tokenizer.add_tokens([self.image_token], special_tokens=True) + else: + self.image_token = tokenizer.image_token self.num_query_tokens = num_query_tokens super().__init__(image_processor, tokenizer, qformer_tokenizer) diff --git 
a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index 3e96d279a42f8d..1d4e59e26b4621 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -63,8 +63,11 @@ class InstructBlipVideoProcessor(ProcessorMixin): qformer_tokenizer_class = "AutoTokenizer" def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): - self.video_token = AddedToken("