From aa7e01a6fd72713079da95852d1bf1764ba37397 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Thu, 28 Nov 2024 16:30:34 -0500 Subject: [PATCH 01/76] move `TestAssistedCandidateGeneratorDifferentTokenizers` into a new testing file --- tests/generation/test_candidate_generator.py | 43 ++++++++++++++++++++ tests/generation/test_utils.py | 39 ------------------ 2 files changed, 43 insertions(+), 39 deletions(-) create mode 100644 tests/generation/test_candidate_generator.py diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py new file mode 100644 index 00000000000000..03fd51324b022f --- /dev/null +++ b/tests/generation/test_candidate_generator.py @@ -0,0 +1,43 @@ +import unittest + +import numpy as np + +from transformers.generation.candidate_generator import AssistedCandidateGeneratorDifferentTokenizers + + +class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): + def test_no_intersection(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[4, 5, 6]]) + result = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag(prompt, prompt_plus_new_tokens) + self.assertEqual(result, (None, None, None)) + + def test_complete_overlap(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[1, 2, 3, 4, 5]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) + + def test_partial_overlap(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[2, 3, 4, 5]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) + + def test_no_new_tokens(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[1, 2, 3]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 063e9a3da8fdad..86d7c0b198c055 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -92,7 +92,6 @@ WatermarkDetector, WatermarkingConfig, ) - from transformers.generation.candidate_generator import AssistedCandidateGeneratorDifferentTokenizers from transformers.generation.utils import _speculative_sampling @@ -4274,41 +4273,3 @@ def test_generate_from_inputs_embeds_with_bos_token_id_is_none(self): # bos_token_id is required when no input ids nor inputs_embeds is passed with self.assertRaises(ValueError): model.generate(max_length=20, bos_token_id=None) - - -class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): - def test_no_intersection(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[4, 5, 6]]) - result = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag(prompt, prompt_plus_new_tokens) - self.assertEqual(result, (None, None, 
None)) - - def test_complete_overlap(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[1, 2, 3, 4, 5]]) - discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - prompt, prompt_plus_new_tokens - ) - self.assertEqual(discrep_length, 0) - np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) - np.testing.assert_array_equal(discrep_only, np.array([[]])) - - def test_partial_overlap(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[2, 3, 4, 5]]) - discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - prompt, prompt_plus_new_tokens - ) - self.assertEqual(discrep_length, 0) - np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) - np.testing.assert_array_equal(discrep_only, np.array([[]])) - - def test_no_new_tokens(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[1, 2, 3]]) - discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - prompt, prompt_plus_new_tokens - ) - self.assertEqual(discrep_length, 0) - np.testing.assert_array_equal(new_tokens_only, np.array([[]])) - np.testing.assert_array_equal(discrep_only, np.array([[]])) From f6b7f2041d7636dedb9185e2713d57df088793ab Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Thu, 28 Nov 2024 16:32:53 -0500 Subject: [PATCH 02/76] refactor --- .../generation/candidate_generator.py | 176 +++++++++--------- 1 file changed, 87 insertions(+), 89 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 7cab88a4bc2e65..d85860ad5dc459 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -194,45 +194,15 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, vocabulary_size)` containing the logits associated to each candidate. """ input_ids = input_ids.to(self.assistant_model.device) - - # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. - new_cur_len = input_ids.shape[-1] - max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) - min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + # Calculate new tokens to generate + min_new_tokens, max_new_tokens = self._calculate_new_tokens(input_ids) if max_new_tokens == 0: return input_ids, None - - # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length - # (which implicitly contains the number of accepted candidates from the previous round) - has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None - if has_past_key_values: - new_cache_size = new_cur_len - 1 - self.assistant_kwargs["past_key_values"] = _crop_past_key_values( - self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 - ) # the assistant does not have the token after the last match, hence the -1 - - self.assistant_kwargs = _prepare_attention_mask( - self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder - ) - self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) - - # 2. Forecast next N tokens using the assistant model. 
- assistant_generation_kwargs = { - self.input_ids_key: input_ids, - "min_new_tokens": min_new_tokens, - "max_new_tokens": max_new_tokens, - "generation_config": self.generation_config, - "logits_processor": self.logits_processor, - } - - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) - - # 3. Update variables for the next round of candidate generation - self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - - # 4. Prepare variables for output - candidate_logits = torch.stack(assistant_output.scores, dim=1) - candidate_ids = assistant_output.sequences + # Update past key values and masks + self._update_past_and_masks(input_ids) + # Generate candidates + generation_args = self._prepare_generation_args(input_ids, min_new_tokens, max_new_tokens) + candidate_ids, candidate_logits = self._generate_candidates(generation_args) return candidate_ids, candidate_logits def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int): @@ -261,6 +231,45 @@ def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.F else: self.num_assistant_tokens = max(1.0, self.num_assistant_tokens - 1.0) + def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> Tuple[int, int]: + """Calculate the minimum and maximum number of new tokens to generate.""" + new_cur_len = input_ids.shape[-1] + max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) + min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + return min_new_tokens, max_new_tokens + + def _update_past_and_masks(self, input_ids: torch.LongTensor, remove_from_pkv: int = 0) -> bool: + """Update past key values and attention masks for subsequent generation rounds.""" + has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None + if has_past_key_values: + new_cache_size = input_ids.shape[-1] - 1 - remove_from_pkv + self.assistant_kwargs["past_key_values"] = _crop_past_key_values( + self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 + ) + self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, input_ids.shape[-1], self.assistant_model.config.is_encoder_decoder + ) + self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, input_ids.shape[-1]) + return has_past_key_values + + def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int) -> Dict: + """Prepare arguments for the generation call.""" + return { + self.input_ids_key: input_ids, + "min_new_tokens": min_new_tokens, + "max_new_tokens": max_new_tokens, + "generation_config": self.generation_config, + "logits_processor": self.logits_processor, + } + + def _generate_candidates(self, generation_args: Dict) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: + """Generate candidate sequences using the assistant model.""" + assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs) + self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values + candidate_logits = torch.stack(assistant_output.scores, dim=1) + candidate_ids = assistant_output.sequences + return candidate_ids, candidate_logits + class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator): """ @@ -310,6 +319,8 @@ def __init__( self.target_tokenizer = target_tokenizer self.assistant_tokenizer = assistant_tokenizer 
+ self.prev_target_ids = None + self.prev_tokens = None self.prev_assistant_ids = None self.target_lookbehind = assistant_model.generation_config.target_lookbehind self.assistant_lookbehind = assistant_model.generation_config.assistant_lookbehind @@ -440,18 +451,41 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, return input_ids, None input_ids = input_ids.to(self.assistant_model.device) + remove_from_pkv = 0 + + assistant_input_ids, remove_from_pkv = self._prepare_assistant_input_ids(input_ids) + self.prev_assistant_ids = assistant_input_ids + + min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - assistant_input_ids.shape[-1]), 0) + + self._update_past_and_masks(assistant_input_ids, remove_from_pkv) + generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens) + self.assistant_kwargs.pop("attention_mask", None) + + assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs) + new_target_ids = self._process_assistant_outputs(input_ids, assistant_output.sequences, assistant_input_ids) + + # Update state + self.prev_target_ids = input_ids + self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values + self.prev_tokens = assistant_output.sequences + + if input_ids.shape[1] >= new_target_ids.shape[1]: + return input_ids, None + + return new_target_ids, None + + def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, int]: + """Converts target input IDs to assistant input IDs, handling discrepancies.""" convert_kwargs = { "source_tokenizer": self.target_tokenizer, "destination_tokenizer": self.assistant_tokenizer, } remove_from_pkv = 0 - # Since re-encoding the tokens may result in tokenization discrepancies, we use 2 look behind values - # (one for each conversion) which mark where to start looking for the overlap between the - # source and target encodings, to ensure the new tokens include the correct prompt suffix. 
- if self.prev_assistant_ids is not None and input_ids.shape[1] > self.target_lookbehind: + if self.prev_tokens is not None and self.prev_target_ids.shape[1] > self.target_lookbehind: # input_ids contains all target prompt input ids and some new target input ids - start_index_in_target_window = input_ids.shape[1] - self.target_lookbehind + start_index_in_target_window = self.prev_target_ids.shape[1] - self.target_lookbehind new_assistant_ids = self.convert_source_tokens_to_target_tokens( input_ids[:, start_index_in_target_window:], **convert_kwargs @@ -459,8 +493,8 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, prompt_use_length = new_assistant_ids.shape[1] prompt_use = self.prev_assistant_ids[:, -prompt_use_length:] - discrepancy_length, new_tokens_only, discrepancy_only = ( - AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag(prompt_use, new_assistant_ids) + discrepancy_length, new_tokens_only, discrepancy_only = self._get_tokens_diag( + prompt_use, new_assistant_ids ) assistant_input_ids = self.prev_assistant_ids @@ -481,48 +515,21 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, else: # edge case: in case of no intersection between prompt and new_assistant_ids assistant_input_ids = torch.cat([assistant_input_ids, new_assistant_ids], dim=-1) - else: assistant_input_ids = self.convert_source_tokens_to_target_tokens(input_ids, **convert_kwargs) + self.prev_target_ids = input_ids - self.prev_assistant_ids = assistant_input_ids - new_cur_len = assistant_input_ids.shape[-1] - min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) - - # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length - # (which implicitly contains the number of accepted candidates from the previous round) - has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None - if has_past_key_values: - new_cache_size = new_cur_len - 1 - remove_from_pkv - self.assistant_kwargs["past_key_values"] = _crop_past_key_values( - self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 - ) # the assistant does not have the token after the last match, hence the -1 - - self.assistant_kwargs = _prepare_attention_mask( - self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder - ) - self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) - - # 2. Forecast next N tokens using the assistant model. 
- assistant_generation_kwargs = { - self.input_ids_key: assistant_input_ids, - "min_new_tokens": min_new_tokens, - "max_new_tokens": max_new_tokens, - "generation_config": self.generation_config, - "logits_processor": self.logits_processor, - } - - self.assistant_kwargs.pop("attention_mask", None) - - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) + return assistant_input_ids, remove_from_pkv + def _process_assistant_outputs( + self, input_ids: torch.LongTensor, assistant_sequences: torch.LongTensor, assistant_input_ids: torch.LongTensor + ) -> torch.LongTensor: + """Processes assistant outputs to obtain target input IDs.""" num_prev_assistant = self.prev_assistant_ids.shape[1] start_assistant_look_index = num_prev_assistant - self.assistant_lookbehind - if start_assistant_look_index < 0: - start_assistant_look_index = 0 new_target_ids_from_window = self.convert_source_tokens_to_target_tokens( - assistant_output.sequences[:, start_assistant_look_index:], + assistant_sequences[:, start_assistant_look_index:], source_tokenizer=self.assistant_tokenizer, destination_tokenizer=self.target_tokenizer, ) @@ -530,9 +537,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, target_prompt_use = input_ids[:, -target_prompt_use_length:] - _, target_new_tokens_only, _ = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - target_prompt_use, new_target_ids_from_window - ) + _, target_new_tokens_only, _ = self._get_tokens_diag(target_prompt_use, new_target_ids_from_window) new_target_ids = input_ids @@ -546,14 +551,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, if hasattr(self.generation_config, "max_length"): new_target_ids = new_target_ids[:, : self.generation_config.max_length] - # 3. Update variables for the next round of candidate generation - self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - - # 4. Prepare variables for output - if input_ids.shape[1] >= new_target_ids.shape[1]: - return input_ids, None - - return new_target_ids, None + return new_target_ids class PromptLookupCandidateGenerator(CandidateGenerator): From 0ded37c5d5c9dc0086dd9e56a91750117aa28469 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Thu, 28 Nov 2024 17:04:57 -0500 Subject: [PATCH 03/76] NOTHING. add space to rerun github actions tests --- src/transformers/generation/candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d85860ad5dc459..43ed191df5d0af 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - + import copy from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple From d48b69b0cf405659ef0a18040a42612db1a7b929 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Thu, 28 Nov 2024 17:05:04 -0500 Subject: [PATCH 04/76] remove it... 
--- src/transformers/generation/candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 43ed191df5d0af..d85860ad5dc459 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - + import copy from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple From b47e33a1c3e9ff14e6b25ca0eca9aa63b9d18f8c Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 13:54:03 -0500 Subject: [PATCH 05/76] `UniversalSpeculativeDecodingGenerator` --- .../generation/candidate_generator.py | 132 +++++++++++++++++- 1 file changed, 131 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d85860ad5dc459..bbe6f85024a137 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -21,7 +21,12 @@ from ..cache_utils import DynamicCache from ..pytorch_utils import isin_mps_friendly -from .logits_process import LogitsProcessorList, MinLengthLogitsProcessor +from .logits_process import ( + LogitNormalization, + LogitsProcessorList, + MinLengthLogitsProcessor, + SuppressTokensLogitsProcessor, +) if TYPE_CHECKING: @@ -554,6 +559,131 @@ def _process_assistant_outputs( return new_target_ids +class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): + """ + `CandidateGenerator` class to be used for Universal Speculative Decoding (USD): speculative decoding with different tokenizers + for the assistant and main models. This class generates candidates through the use of a smaller + model. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) + assistant_model (`PreTrainedModel`): + The model to be used for generating candidates. This model should be smaller than the main model. + target_tokenizer (`PreTrainedTokenizerBase`): + The tokenizer used for the target model. + assistant_tokenizer (`PreTrainedTokenizerBase`): + The tokenizer used for the assistant model. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. + logits_processor (`LogitsProcessorList`): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + model_kwargs (`Dict`): + The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant + model as well. + inputs_tensor (`torch.Tensor`, *optional*): + The model input tensor. In encoder-decoder models, this is the encoder input. 
+ """ + + def __init__( + self, + input_ids: torch.LongTensor, + assistant_model: "PreTrainedModel", + target_tokenizer: "PreTrainedTokenizerBase", + assistant_tokenizer: "PreTrainedTokenizerBase", + generation_config: "GenerationConfig", + model_kwargs: Dict, + inputs_tensor: Optional[torch.Tensor] = None, + logits_processor: "LogitsProcessorList" = None, + ): + target_vocab: dict[str, int] = target_tokenizer.get_vocab() + assistant_vocab: dict[str, int] = assistant_tokenizer.get_vocab() + self._target_to_assistant_input_ids: dict[int, int] = { + target_vocab[tok]: assistant_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() + } + # Suppress tokens that are in the assistant vocab but not in the target vocab + suppress_input_ids = list( + set(assistant_vocab.values()) - set(self._target_to_assistant_input_ids.values()) + ) + logits_processor.append( + SuppressTokensLogitsProcessor( + suppress_tokens=suppress_input_ids, + device=assistant_model.device, + ), + LogitNormalization(), + ) + super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) + self.prev_target_seq_len: int = 0 + + def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: + """ + Fetches the candidates to be tried for the current input. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) + + Return: + `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be + assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, + vocabulary_size)` containing the logits associated to each candidate. + """ + # input_ids = input_ids.to(self.assistant_model.device) + + def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: + target_seq_len = target_input_ids.shape[-1] + target_new_ids = target_input_ids[ + :, -(target_seq_len - self.prev_target_seq_len) : + ] + return torch.tensor( + [self._target_to_assistant_input_ids[tok.item()] for tok in target_new_ids.flatten()], + device=self.assistant_model.device, + ).view(target_new_ids.shape) + + input_ids = get_assistant_input_ids(input_ids) + + # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. + new_cur_len = input_ids.shape[-1] + max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) + min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + if max_new_tokens == 0: + return input_ids, None + + # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length + # (which implicitly contains the number of accepted candidates from the previous round) + has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None + if has_past_key_values: + new_cache_size = new_cur_len - 1 + self.assistant_kwargs["past_key_values"] = _crop_past_key_values( + self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 + ) # the assistant does not have the token after the last match, hence the -1 + + self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder + ) + self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) + + # 2. 
Forecast next N tokens using the assistant model. + assistant_generation_kwargs = { + self.input_ids_key: input_ids, + "min_new_tokens": min_new_tokens, + "max_new_tokens": max_new_tokens, + "generation_config": self.generation_config, + "logits_processor": self.logits_processor, + } + + assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) + + # 3. Update variables for the next round of candidate generation + self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values + + # 4. Prepare variables for output + candidate_logits = torch.stack(assistant_output.scores, dim=1) + candidate_ids = assistant_output.sequences + return candidate_ids, candidate_logits + + class PromptLookupCandidateGenerator(CandidateGenerator): """ `CandidateGenerator` class to be used for prompt lookup generation. This class generates candidates by looking up From 8a991299aaba67414b1c6a6ef6dcfa2c01a90107 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 14:01:00 -0500 Subject: [PATCH 06/76] Use `UniversalSpeculativeDecodingGenerator` when `generation_config.do_sample=True` --- src/transformers/generation/utils.py | 36 ++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 05e39c4a9b56b5..07c743c6b79f3d 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -57,6 +57,7 @@ CandidateGenerator, EarlyExitCandidateGenerator, PromptLookupCandidateGenerator, + UniversalSpeculativeDecodingGenerator, _crop_past_key_values, _prepare_attention_mask, _prepare_token_type_ids, @@ -846,16 +847,31 @@ def _get_candidate_generator( max_length=generation_config.max_length, ) elif different_tokenizers: - candidate_generator = AssistedCandidateGeneratorDifferentTokenizers( - input_ids=input_ids, - assistant_model=assistant_model, - generation_config=generation_config, - model_kwargs=model_kwargs, - inputs_tensor=inputs_tensor, - logits_processor=logits_processor, - target_tokenizer=target_tokenizer, - assistant_tokenizer=assistant_tokenizer, - ) + match generation_config.do_sample: + case True: + candidate_generator = UniversalSpeculativeDecodingGenerator( + input_ids=input_ids, + assistant_model=assistant_model, + generation_config=generation_config, + model_kwargs=model_kwargs, + inputs_tensor=inputs_tensor, + logits_processor=logits_processor, + target_tokenizer=target_tokenizer, + assistant_tokenizer=assistant_tokenizer, + ) + case False: + candidate_generator = AssistedCandidateGeneratorDifferentTokenizers( + input_ids=input_ids, + assistant_model=assistant_model, + generation_config=generation_config, + model_kwargs=model_kwargs, + inputs_tensor=inputs_tensor, + logits_processor=logits_processor, + target_tokenizer=target_tokenizer, + assistant_tokenizer=assistant_tokenizer, + ) + case _: + raise ValueError(f"Invalid value for `do_sample`: {generation_config.do_sample}") else: candidate_generator = AssistedCandidateGenerator( input_ids=input_ids, From 4649bd2f8406af58c96d845435fea9a8e6a80c17 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 15:08:46 -0500 Subject: [PATCH 07/76] assistant tokenizes only the target's new suffix --- .../generation/candidate_generator.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 
bbe6f85024a137..d23480a45ac8d1 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -599,12 +599,12 @@ def __init__( ): target_vocab: dict[str, int] = target_tokenizer.get_vocab() assistant_vocab: dict[str, int] = assistant_tokenizer.get_vocab() - self._target_to_assistant_input_ids: dict[int, int] = { - target_vocab[tok]: assistant_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() + self._assistant_to_target_input_ids: dict[int, int] = { + assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() } # Suppress tokens that are in the assistant vocab but not in the target vocab suppress_input_ids = list( - set(assistant_vocab.values()) - set(self._target_to_assistant_input_ids.values()) + set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values()) ) logits_processor.append( SuppressTokensLogitsProcessor( @@ -614,7 +614,8 @@ def __init__( LogitNormalization(), ) super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) - self.prev_target_seq_len: int = 0 + self._prev_target_seq_len: int = 0 + self._prev_assistant_ids: torch.LongTensor | None = None def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ @@ -629,19 +630,25 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, vocabulary_size)` containing the logits associated to each candidate. """ - # input_ids = input_ids.to(self.assistant_model.device) + has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: + nonlocal has_past_key_values target_seq_len = target_input_ids.shape[-1] target_new_ids = target_input_ids[ - :, -(target_seq_len - self.prev_target_seq_len) : + :, -(target_seq_len - self._prev_target_seq_len) : ] - return torch.tensor( - [self._target_to_assistant_input_ids[tok.item()] for tok in target_new_ids.flatten()], - device=self.assistant_model.device, - ).view(target_new_ids.shape) + # convert target_new_ids to string, and then, convert the string to assistant_new_ids + target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) + assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks, add_special_tokens=False) + if self._prev_assistant_ids is None: + self._prev_assistant_ids = assistant_new_ids + else: + self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) + return self._prev_assistant_ids input_ids = get_assistant_input_ids(input_ids) + input_ids = input_ids.to(self.assistant_model.device) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. new_cur_len = input_ids.shape[-1] @@ -652,7 +659,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 1. 
If it is not the first round of candidate generation, prepare the inputs based on the input_ids length # (which implicitly contains the number of accepted candidates from the previous round) - has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None + # has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None if has_past_key_values: new_cache_size = new_cur_len - 1 self.assistant_kwargs["past_key_values"] = _crop_past_key_values( @@ -681,6 +688,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences + candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x.item()]) return candidate_ids, candidate_logits From f199c94ac821cffe81b9ea04955d71792d078825 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 15:24:30 -0500 Subject: [PATCH 08/76] formatting --- src/transformers/generation/candidate_generator.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d23480a45ac8d1..6cda4df3153f05 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -603,9 +603,7 @@ def __init__( assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() } # Suppress tokens that are in the assistant vocab but not in the target vocab - suppress_input_ids = list( - set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values()) - ) + suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values())) logits_processor.append( SuppressTokensLogitsProcessor( suppress_tokens=suppress_input_ids, @@ -635,11 +633,10 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: nonlocal has_past_key_values target_seq_len = target_input_ids.shape[-1] - target_new_ids = target_input_ids[ - :, -(target_seq_len - self._prev_target_seq_len) : - ] - # convert target_new_ids to string, and then, convert the string to assistant_new_ids + target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] + # Convert target_new_ids to string target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) + # Convert the string to assistant_new_ids assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks, add_special_tokens=False) if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids From 19c0057dd56b5385f6f7d0a0379c4173b6e0e7ca Mon Sep 17 00:00:00 2001 From: jmamou Date: Thu, 21 Nov 2024 05:20:50 -0800 Subject: [PATCH 09/76] fix code --- .../generation/candidate_generator.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 6cda4df3153f05..16cd7487e4bafe 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -608,10 +608,10 @@ def __init__( SuppressTokensLogitsProcessor( suppress_tokens=suppress_input_ids, device=assistant_model.device, - ), - LogitNormalization(), + ) ) - 
super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) + logits_processor.append(LogitNormalization()) + super().__init__(input_ids, assistant_model, target_tokenizer, assistant_tokenizer, generation_config, model_kwargs, inputs_tensor, logits_processor) self._prev_target_seq_len: int = 0 self._prev_assistant_ids: torch.LongTensor | None = None @@ -637,7 +637,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # Convert target_new_ids to string target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) # Convert the string to assistant_new_ids - assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks, add_special_tokens=False) + assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks[0], add_special_tokens=False) if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids else: @@ -645,6 +645,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen return self._prev_assistant_ids input_ids = get_assistant_input_ids(input_ids) + # Ensure input_ids is a 2D tensor + if isinstance(input_ids, list): + input_ids = torch.tensor(input_ids).unsqueeze(0) input_ids = input_ids.to(self.assistant_model.device) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. @@ -662,12 +665,13 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self.assistant_kwargs["past_key_values"] = _crop_past_key_values( self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 ) # the assistant does not have the token after the last match, hence the -1 - - self.assistant_kwargs = _prepare_attention_mask( - self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder - ) self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) + # we need to update the attention mask to reflect the new input_ids length + self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder + ) + # 2. Forecast next N tokens using the assistant model. assistant_generation_kwargs = { self.input_ids_key: input_ids, @@ -685,7 +689,10 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. 
Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x.item()]) + device = candidate_ids.device + candidate_ids = candidate_ids.cpu() + candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x]) + candidate_ids = candidate_ids.to(device) return candidate_ids, candidate_logits From acf5a4b315c18a01c561f3a94cc49ae768d5265c Mon Sep 17 00:00:00 2001 From: jmamou Date: Sun, 24 Nov 2024 05:15:37 -0800 Subject: [PATCH 10/76] fix code --- .../generation/candidate_generator.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 16cd7487e4bafe..4d8aa41f1fcd29 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -603,7 +603,7 @@ def __init__( assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() } # Suppress tokens that are in the assistant vocab but not in the target vocab - suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values())) + suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) logits_processor.append( SuppressTokensLogitsProcessor( suppress_tokens=suppress_input_ids, @@ -634,6 +634,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen nonlocal has_past_key_values target_seq_len = target_input_ids.shape[-1] target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] + self._prev_target_seq_len = target_seq_len # Convert target_new_ids to string target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) # Convert the string to assistant_new_ids @@ -641,20 +642,17 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids else: - self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) - return self._prev_assistant_ids - + self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids + return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) + + target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) - # Ensure input_ids is a 2D tensor - if isinstance(input_ids, list): - input_ids = torch.tensor(input_ids).unsqueeze(0) - input_ids = input_ids.to(self.assistant_model.device) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. new_cur_len = input_ids.shape[-1] max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) - if max_new_tokens == 0: + if max_new_tokens <= 0: return input_ids, None # 1. 
If it is not the first round of candidate generation, prepare the inputs based on the input_ids length @@ -691,8 +689,10 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen candidate_ids = assistant_output.sequences device = candidate_ids.device candidate_ids = candidate_ids.cpu() - candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x]) + candidate_ids = candidate_ids[0, -(len(candidate_ids[0])-input_ids.shape[1]):].apply_(lambda x: self._assistant_to_target_input_ids[x]) candidate_ids = candidate_ids.to(device) + candidate_ids = torch.cat((target_input_ids, candidate_ids.unsqueeze(0)), dim=1) + return candidate_ids, candidate_logits From 3712117297f9c73def57285748b6376aeccfbff3 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 11:28:46 -0500 Subject: [PATCH 11/76] formatting --- .../generation/candidate_generator.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 4d8aa41f1fcd29..da7a77adc97d6c 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -611,7 +611,16 @@ def __init__( ) ) logits_processor.append(LogitNormalization()) - super().__init__(input_ids, assistant_model, target_tokenizer, assistant_tokenizer, generation_config, model_kwargs, inputs_tensor, logits_processor) + super().__init__( + input_ids, + assistant_model, + target_tokenizer, + assistant_tokenizer, + generation_config, + model_kwargs, + inputs_tensor, + logits_processor, + ) self._prev_target_seq_len: int = 0 self._prev_assistant_ids: torch.LongTensor | None = None @@ -644,7 +653,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen else: self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) - + target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) @@ -689,7 +698,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen candidate_ids = assistant_output.sequences device = candidate_ids.device candidate_ids = candidate_ids.cpu() - candidate_ids = candidate_ids[0, -(len(candidate_ids[0])-input_ids.shape[1]):].apply_(lambda x: self._assistant_to_target_input_ids[x]) + candidate_ids = candidate_ids[0, -(len(candidate_ids[0]) - input_ids.shape[1]) :].apply_( + lambda x: self._assistant_to_target_input_ids[x] + ) candidate_ids = candidate_ids.to(device) candidate_ids = torch.cat((target_input_ids, candidate_ids.unsqueeze(0)), dim=1) From 63f2f4622b418eda71cf0aad682bfa3ec9838a51 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 14:20:56 -0500 Subject: [PATCH 12/76] add `TestGenerateWithDifferentModels` --- tests/generation/test_candidate_generator.py | 63 ++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 03fd51324b022f..38840431816532 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -2,7 +2,9 @@ import numpy as np +from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation.candidate_generator import AssistedCandidateGeneratorDifferentTokenizers +from transformers.generation.utils import GenerationConfig class 
TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): @@ -41,3 +43,64 @@ def test_no_new_tokens(self): self.assertEqual(discrep_length, 0) np.testing.assert_array_equal(new_tokens_only, np.array([[]])) np.testing.assert_array_equal(discrep_only, np.array([[]])) + + +class TestGenerateWithDifferentModels(unittest.TestCase): + """Tests generation with different target and assistant models.""" + + def test_generate_with_different_models(self): + # Use smaller test models instead + target_model_checkpoint = "hf-internal-testing/tiny-random-LlamaForCausalLM" + assistant_checkpoint = "hf-internal-testing/tiny-random-gpt2" + + prompt = "Alice and Bob" + + # Load models sequentially and handle cleanup + target_model = AutoModelForCausalLM.from_pretrained( + target_model_checkpoint, + ) + target_tokenizer = AutoTokenizer.from_pretrained(target_model_checkpoint) + + assistant_model = AutoModelForCausalLM.from_pretrained( + assistant_checkpoint, + ) + assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_checkpoint) + + # Tokenize input + input_ids = target_tokenizer(prompt, return_tensors="pt").input_ids.to(target_model.device) + + # Create generation configs + do_sample = False + base_config = GenerationConfig( + max_new_tokens=20, + do_sample=do_sample, + ) + + # Generate with and without assistant model + outputs_normal = target_model.generate( + input_ids, + generation_config=base_config, + ) + + # Pass the assistant model and tokenizers directly to the generate method + outputs_assisted = target_model.generate( + input_ids, + generation_config=base_config, + assistant_model=assistant_model, + tokenizer=target_tokenizer, + assistant_tokenizer=assistant_tokenizer, + ) + + # Decode outputs + text_normal = target_tokenizer.batch_decode(outputs_normal, skip_special_tokens=True)[0] + text_assisted = target_tokenizer.batch_decode(outputs_assisted, skip_special_tokens=True)[0] + + # Basic validation + self.assertIsInstance(text_normal, str) + self.assertIsInstance(text_assisted, str) + self.assertGreater(len(text_normal), len(prompt)) + self.assertGreater(len(text_assisted), len(prompt)) + self.assertTrue(text_normal.startswith(prompt)) + self.assertTrue(text_assisted.startswith(prompt)) + if not do_sample: + self.assertEqual(text_normal, text_assisted) From 6ac33f147121d7d2b5e51490369b06ba93443b0b Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 14:24:34 -0500 Subject: [PATCH 13/76] `TestGenerateWithDifferentModels` parameterize on `do_sample` --- tests/generation/test_candidate_generator.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 38840431816532..6ef031edc922fb 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,4 +1,5 @@ import unittest +from parameterized import parameterized import numpy as np @@ -48,7 +49,11 @@ def test_no_new_tokens(self): class TestGenerateWithDifferentModels(unittest.TestCase): """Tests generation with different target and assistant models.""" - def test_generate_with_different_models(self): + @parameterized.expand([ + (False,), + (True,), + ]) + def test_generate_with_different_models(self, do_sample): # Use smaller test models instead target_model_checkpoint = "hf-internal-testing/tiny-random-LlamaForCausalLM" assistant_checkpoint = "hf-internal-testing/tiny-random-gpt2" @@ -70,7 +75,6 @@ def test_generate_with_different_models(self): input_ids 
= target_tokenizer(prompt, return_tensors="pt").input_ids.to(target_model.device) # Create generation configs - do_sample = False base_config = GenerationConfig( max_new_tokens=20, do_sample=do_sample, From 69383117d165687c6e1394945a4c4a201baaa2dc Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 16:06:16 -0500 Subject: [PATCH 14/76] `AssistantVocabMapping` & `AssistantVocabMappingCache` --- .../generation/candidate_generator.py | 71 +++++++++++++++---- 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index da7a77adc97d6c..ea1f7fa23e8632 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -14,7 +14,10 @@ # limitations under the License. import copy +from functools import lru_cache +import threading from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple +import weakref import numpy as np import torch @@ -559,6 +562,52 @@ def _process_assistant_outputs( return new_target_ids +class AssistantVocabMapping: + def __init__(self, target_tokenizer, assistant_tokenizer): + self.target_tokenizer = target_tokenizer + self.assistant_tokenizer = assistant_tokenizer + self.assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() + self.suppress_input_ids = self._get_suppress_input_ids() + + def _get_assistant_to_target_input_ids(self) -> dict[int, int]: + """ + Get a mapping from assistant tokens to target tokens based on vocabularies. + """ + target_vocab = self.target_tokenizer.get_vocab() + assistant_vocab = self.assistant_tokenizer.get_vocab() + return { + assistant_vocab[tok]: target_vocab[tok] + for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) + } + + def _get_suppress_input_ids(self) -> list[int]: + """ + Get the input ids that are in the assistant vocab but not in the target vocab. 
+ """ + assistant_vocab = self.assistant_tokenizer.get_vocab() + return list(set(assistant_vocab.values()) - set(self.assistant_to_target_input_ids.keys())) + + +class AssistantVocabMappingCache: + _lock = threading.Lock() + _cache = weakref.WeakKeyDictionary() + + @classmethod + def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMapping: + with cls._lock: + assistant_dict = cls._cache.get(target_tokenizer) + if assistant_dict is None: + assistant_dict = weakref.WeakKeyDictionary() + cls._cache[target_tokenizer] = assistant_dict + + mapping = assistant_dict.get(assistant_tokenizer) + if mapping is None: + mapping = AssistantVocabMapping(target_tokenizer, assistant_tokenizer) + assistant_dict[assistant_tokenizer] = mapping + + return mapping + + class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): """ `CandidateGenerator` class to be used for Universal Speculative Decoding (USD): speculative decoding with different tokenizers @@ -597,20 +646,18 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - target_vocab: dict[str, int] = target_tokenizer.get_vocab() - assistant_vocab: dict[str, int] = assistant_tokenizer.get_vocab() - self._assistant_to_target_input_ids: dict[int, int] = { - assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() - } - # Suppress tokens that are in the assistant vocab but not in the target vocab - suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) - logits_processor.append( + assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping( + target_tokenizer, assistant_tokenizer + ) + self._assistant_to_target_input_ids = assistant_vocab_mapping.assistant_to_target_input_ids + logits_processor += [ SuppressTokensLogitsProcessor( - suppress_tokens=suppress_input_ids, + suppress_tokens=assistant_vocab_mapping.suppress_input_ids, device=assistant_model.device, - ) - ) - logits_processor.append(LogitNormalization()) + ), + LogitNormalization(), + ] + super().__init__( input_ids, assistant_model, From 5a0db3bb7322718171354ac9d87871b51a7624a4 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 16:06:35 -0500 Subject: [PATCH 15/76] formatting --- src/transformers/generation/candidate_generator.py | 12 ++++-------- tests/generation/test_candidate_generator.py | 12 +++++++----- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index ea1f7fa23e8632..e35c9086adcf18 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -14,10 +14,9 @@ # limitations under the License. 
import copy -from functools import lru_cache import threading -from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import weakref +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import numpy as np import torch @@ -576,8 +575,7 @@ def _get_assistant_to_target_input_ids(self) -> dict[int, int]: target_vocab = self.target_tokenizer.get_vocab() assistant_vocab = self.assistant_tokenizer.get_vocab() return { - assistant_vocab[tok]: target_vocab[tok] - for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) + assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) } def _get_suppress_input_ids(self) -> list[int]: @@ -606,7 +604,7 @@ def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMap assistant_dict[assistant_tokenizer] = mapping return mapping - + class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): """ @@ -646,9 +644,7 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping( - target_tokenizer, assistant_tokenizer - ) + assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) self._assistant_to_target_input_ids = assistant_vocab_mapping.assistant_to_target_input_ids logits_processor += [ SuppressTokensLogitsProcessor( diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 6ef031edc922fb..f544a6cb2d838b 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,7 +1,7 @@ import unittest -from parameterized import parameterized import numpy as np +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation.candidate_generator import AssistedCandidateGeneratorDifferentTokenizers @@ -49,10 +49,12 @@ def test_no_new_tokens(self): class TestGenerateWithDifferentModels(unittest.TestCase): """Tests generation with different target and assistant models.""" - @parameterized.expand([ - (False,), - (True,), - ]) + @parameterized.expand( + [ + (False,), + (True,), + ] + ) def test_generate_with_different_models(self, do_sample): # Use smaller test models instead target_model_checkpoint = "hf-internal-testing/tiny-random-LlamaForCausalLM" From 92f8ad3b55b65fffea2c85d2674be2e8486430d6 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 16:37:19 -0500 Subject: [PATCH 16/76] `AssistantToTargetTranslator`: `get_target_input_ids` & `get_target_logits` --- .../generation/candidate_generator.py | 54 +++++++++++-------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index e35c9086adcf18..3576f3adee0eea 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -561,19 +561,19 @@ def _process_assistant_outputs( return new_target_ids -class AssistantVocabMapping: +class AssistantToTargetTranslator: def __init__(self, target_tokenizer, assistant_tokenizer): - self.target_tokenizer = target_tokenizer - self.assistant_tokenizer = assistant_tokenizer - self.assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() + self._target_tokenizer = target_tokenizer + self._assistant_tokenizer = assistant_tokenizer + 
self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() self.suppress_input_ids = self._get_suppress_input_ids() def _get_assistant_to_target_input_ids(self) -> dict[int, int]: """ Get a mapping from assistant tokens to target tokens based on vocabularies. """ - target_vocab = self.target_tokenizer.get_vocab() - assistant_vocab = self.assistant_tokenizer.get_vocab() + target_vocab = self._target_tokenizer.get_vocab() + assistant_vocab = self._assistant_tokenizer.get_vocab() return { assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) } @@ -582,8 +582,26 @@ def _get_suppress_input_ids(self) -> list[int]: """ Get the input ids that are in the assistant vocab but not in the target vocab. """ - assistant_vocab = self.assistant_tokenizer.get_vocab() - return list(set(assistant_vocab.values()) - set(self.assistant_to_target_input_ids.keys())) + assistant_vocab = self._assistant_tokenizer.get_vocab() + return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) + + def get_target_input_ids(self, assistant_input_ids: torch.LongTensor) -> torch.LongTensor: + """ + Return the target input ids that correspond to the assistant input ids. + """ + return assistant_input_ids.apply_(lambda x: self._assistant_to_target_input_ids.get(x, x)) + + def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: + """ + Return the target logits that correspond to the assistant logits. + """ + target_vocab_size = len(self._target_tokenizer.get_vocab()) + ret: torch.FloatTensor = torch.zeros( + (assistant_logits.shape[0], target_vocab_size), device=assistant_logits.device + ) + ret[:, self.suppress_input_ids] = -float("inf") + ret[:, list(self._assistant_to_target_input_ids.values())] = assistant_logits + return ret class AssistantVocabMappingCache: @@ -591,7 +609,7 @@ class AssistantVocabMappingCache: _cache = weakref.WeakKeyDictionary() @classmethod - def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMapping: + def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer) if assistant_dict is None: @@ -600,7 +618,7 @@ def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMap mapping = assistant_dict.get(assistant_tokenizer) if mapping is None: - mapping = AssistantVocabMapping(target_tokenizer, assistant_tokenizer) + mapping = AssistantToTargetTranslator(target_tokenizer, assistant_tokenizer) assistant_dict[assistant_tokenizer] = mapping return mapping @@ -644,11 +662,10 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) - self._assistant_to_target_input_ids = assistant_vocab_mapping.assistant_to_target_input_ids + self._assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) logits_processor += [ SuppressTokensLogitsProcessor( - suppress_tokens=assistant_vocab_mapping.suppress_input_ids, + suppress_tokens=self._assistant_vocab_mapping.suppress_input_ids, device=assistant_model.device, ), LogitNormalization(), @@ -697,7 +714,6 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids return 
torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) - target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. @@ -739,13 +755,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - device = candidate_ids.device - candidate_ids = candidate_ids.cpu() - candidate_ids = candidate_ids[0, -(len(candidate_ids[0]) - input_ids.shape[1]) :].apply_( - lambda x: self._assistant_to_target_input_ids[x] - ) - candidate_ids = candidate_ids.to(device) - candidate_ids = torch.cat((target_input_ids, candidate_ids.unsqueeze(0)), dim=1) + candidate_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) + + candidate_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) return candidate_ids, candidate_logits From 7c8708ed3fd612d747a3ebaf42b2a4fe81ec2f07 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 18:11:57 -0500 Subject: [PATCH 17/76] improve `_get_assistant_to_target_input_ids` & formatting --- .../generation/candidate_generator.py | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 3576f3adee0eea..2914b472d96f26 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -562,11 +562,15 @@ def _process_assistant_outputs( class AssistantToTargetTranslator: - def __init__(self, target_tokenizer, assistant_tokenizer): - self._target_tokenizer = target_tokenizer - self._assistant_tokenizer = assistant_tokenizer - self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() - self.suppress_input_ids = self._get_suppress_input_ids() + """ + Translate the assistant into the target universe. + """ + + def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase"): + self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer + self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer + self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids() + self.suppress_input_ids: list[int] = self._get_suppress_input_ids() def _get_assistant_to_target_input_ids(self) -> dict[int, int]: """ @@ -595,13 +599,18 @@ def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatT """ Return the target logits that correspond to the assistant logits. """ - target_vocab_size = len(self._target_tokenizer.get_vocab()) - ret: torch.FloatTensor = torch.zeros( - (assistant_logits.shape[0], target_vocab_size), device=assistant_logits.device + target_vocab_size: int = len(self._target_tokenizer.get_vocab()) + target_shape: tuple[int, ...] 
= (*assistant_logits.shape[:-1], target_vocab_size) + target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")) + assistant_logits_supported_mask: torch.BoolTensor = assistant_logits > -float("inf") + assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[ + -1 + ] + target_logits_supported_indices: torch.IntTensor = assistant_logits_supported_indices.apply_( + lambda x: self._assistant_to_target_input_ids[x] ) - ret[:, self.suppress_input_ids] = -float("inf") - ret[:, list(self._assistant_to_target_input_ids.values())] = assistant_logits - return ret + target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask] + return target_logits class AssistantVocabMappingCache: @@ -609,7 +618,9 @@ class AssistantVocabMappingCache: _cache = weakref.WeakKeyDictionary() @classmethod - def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantToTargetTranslator: + def get_mapping( + cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase" + ) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer) if assistant_dict is None: @@ -755,11 +766,10 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - candidate_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) - - candidate_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) + target_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) - return candidate_ids, candidate_logits + target_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) + return target_ids, target_logits class PromptLookupCandidateGenerator(CandidateGenerator): From 880d0aeaaec0c2d4857edb42c369f1707d2725d9 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 18:19:52 -0500 Subject: [PATCH 18/76] renaming --- src/transformers/generation/candidate_generator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 2914b472d96f26..026f338960b5a8 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -613,12 +613,12 @@ def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatT return target_logits -class AssistantVocabMappingCache: +class AssistantVocabTranslatorCache: _lock = threading.Lock() _cache = weakref.WeakKeyDictionary() @classmethod - def get_mapping( + def get_translator( cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase" ) -> AssistantToTargetTranslator: with cls._lock: @@ -673,10 +673,10 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - self._assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) + self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) logits_processor += [ SuppressTokensLogitsProcessor( - suppress_tokens=self._assistant_vocab_mapping.suppress_input_ids, + suppress_tokens=self._atm_translator.suppress_input_ids, device=assistant_model.device, ), 
LogitNormalization(), @@ -766,9 +766,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - target_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) + target_ids = self._atm_translator.get_target_input_ids(candidate_ids) - target_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) + target_logits = self._atm_translator.get_target_logits(candidate_logits) return target_ids, target_logits From d9b5e748a803f90950cb168119193a1d1599e26d Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 19:05:36 -0500 Subject: [PATCH 19/76] WIP: debugging `min_new_tokens` --- .../generation/candidate_generator.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 026f338960b5a8..710ef04eb75a68 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -571,6 +571,12 @@ def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokeni self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids() self.suppress_input_ids: list[int] = self._get_suppress_input_ids() + self.logits_processors: LogitsProcessorList = LogitsProcessorList( + [ + SuppressTokensLogitsProcessor(self.suppress_input_ids), + LogitNormalization(), + ] + ) def _get_assistant_to_target_input_ids(self) -> dict[int, int]: """ @@ -674,14 +680,6 @@ def __init__( logits_processor: "LogitsProcessorList" = None, ): self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) - logits_processor += [ - SuppressTokensLogitsProcessor( - suppress_tokens=self._atm_translator.suppress_input_ids, - device=assistant_model.device, - ), - LogitNormalization(), - ] - super().__init__( input_ids, assistant_model, @@ -731,7 +729,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen new_cur_len = input_ids.shape[-1] max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) - if max_new_tokens <= 0: + if max_new_tokens == 0: return input_ids, None # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length @@ -752,20 +750,27 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 2. Forecast next N tokens using the assistant model. assistant_generation_kwargs = { self.input_ids_key: input_ids, - "min_new_tokens": min_new_tokens, - "max_new_tokens": max_new_tokens, + # "min_new_tokens": min_new_tokens, + # "max_new_tokens": max_new_tokens, + "min_new_tokens": 100, + "max_new_tokens": 100, "generation_config": self.generation_config, "logits_processor": self.logits_processor, } - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) + assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True) # 3. 
Update variables for the next round of candidate generation self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values # 4. Prepare variables for output - candidate_logits = torch.stack(assistant_output.scores, dim=1) + # candidate_logits = torch.stack(assistant_output.scores, dim=1) + candidate_logits = torch.stack(assistant_output.logits, dim=1) + if not candidate_logits.shape[1] > 1: + msg = f"Since we set min_new_tokens to {assistant_generation_kwargs['min_new_tokens']} and max_new_tokens to {assistant_generation_kwargs['max_new_tokens']}, we expect at least 2 candidates, but seems like we got {candidate_logits.shape[1]} candidates." + raise Exception(msg) candidate_ids = assistant_output.sequences + candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) target_ids = self._atm_translator.get_target_input_ids(candidate_ids) target_logits = self._atm_translator.get_target_logits(candidate_logits) From 25974d5e998260017b7f3846bf35644bc9551909 Mon Sep 17 00:00:00 2001 From: jmamou Date: Mon, 25 Nov 2024 04:03:41 -0800 Subject: [PATCH 20/76] fix get_target_ids --- .../generation/candidate_generator.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 710ef04eb75a68..e2dcba1e2cb076 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -595,11 +595,16 @@ def _get_suppress_input_ids(self) -> list[int]: assistant_vocab = self._assistant_tokenizer.get_vocab() return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) - def get_target_input_ids(self, assistant_input_ids: torch.LongTensor) -> torch.LongTensor: + def get_target_ids(self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor) -> torch.LongTensor: """ - Return the target input ids that correspond to the assistant input ids. + Return the target candidate ids that correspond to the assistant candidate ids. + Note that we have already the target ids for the prompt and we only need to find the target ids for the new tokens. + Moreover, assistant ids of the original prompt does not necessarily appear in _assistant_to_target_input_ids. """ - return assistant_input_ids.apply_(lambda x: self._assistant_to_target_input_ids.get(x, x)) + target_candidate_ids = assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :].apply_( + lambda x: self._assistant_to_target_input_ids.get(x, x) + ) + return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1) def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: """ @@ -723,6 +728,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) + target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. @@ -764,14 +770,13 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values # 4. 
Prepare variables for output - # candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_logits = torch.stack(assistant_output.logits, dim=1) if not candidate_logits.shape[1] > 1: msg = f"Since we set min_new_tokens to {assistant_generation_kwargs['min_new_tokens']} and max_new_tokens to {assistant_generation_kwargs['max_new_tokens']}, we expect at least 2 candidates, but seems like we got {candidate_logits.shape[1]} candidates." raise Exception(msg) candidate_ids = assistant_output.sequences candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) - target_ids = self._atm_translator.get_target_input_ids(candidate_ids) + target_ids = self._atm_translator.get_target_ids(input_ids, target_input_ids, candidate_ids) target_logits = self._atm_translator.get_target_logits(candidate_logits) return target_ids, target_logits From b8636ab1d811ccbb950ab548a68c7a53dfcc192f Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 13:54:03 -0500 Subject: [PATCH 21/76] `UniversalSpeculativeDecodingGenerator` --- .../generation/candidate_generator.py | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index e2dcba1e2cb076..d94e61d0a48f42 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -782,6 +782,131 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen return target_ids, target_logits +class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): + """ + `CandidateGenerator` class to be used for Universal Speculative Decoding (USD): speculative decoding with different tokenizers + for the assistant and main models. This class generates candidates through the use of a smaller + model. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) + assistant_model (`PreTrainedModel`): + The model to be used for generating candidates. This model should be smaller than the main model. + target_tokenizer (`PreTrainedTokenizerBase`): + The tokenizer used for the target model. + assistant_tokenizer (`PreTrainedTokenizerBase`): + The tokenizer used for the assistant model. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. + logits_processor (`LogitsProcessorList`): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + model_kwargs (`Dict`): + The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant + model as well. + inputs_tensor (`torch.Tensor`, *optional*): + The model input tensor. In encoder-decoder models, this is the encoder input. 
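Example (an illustrative sketch of the vocabulary overlap this generator relies on; the checkpoint paths are placeholders, not models used in this PR):

```python
from transformers import AutoTokenizer

# Hypothetical checkpoints -- any two models with different tokenizers behave the same way.
target_tokenizer = AutoTokenizer.from_pretrained("path/to/target-checkpoint")
assistant_tokenizer = AutoTokenizer.from_pretrained("path/to/assistant-checkpoint")

target_vocab = target_tokenizer.get_vocab()
assistant_vocab = assistant_tokenizer.get_vocab()

# Assistant ids that have a counterpart in the target vocab (shared token strings).
assistant_to_target_input_ids = {
    assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys()
}
# Assistant-only ids are suppressed while drafting candidates.
suppress_input_ids = sorted(set(assistant_vocab.values()) - set(assistant_to_target_input_ids.keys()))
print(f"shared: {len(assistant_to_target_input_ids)}, assistant-only: {len(suppress_input_ids)}")
```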
+ """ + + def __init__( + self, + input_ids: torch.LongTensor, + assistant_model: "PreTrainedModel", + target_tokenizer: "PreTrainedTokenizerBase", + assistant_tokenizer: "PreTrainedTokenizerBase", + generation_config: "GenerationConfig", + model_kwargs: Dict, + inputs_tensor: Optional[torch.Tensor] = None, + logits_processor: "LogitsProcessorList" = None, + ): + target_vocab: dict[str, int] = target_tokenizer.get_vocab() + assistant_vocab: dict[str, int] = assistant_tokenizer.get_vocab() + self._target_to_assistant_input_ids: dict[int, int] = { + target_vocab[tok]: assistant_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() + } + # Suppress tokens that are in the assistant vocab but not in the target vocab + suppress_input_ids = list( + set(assistant_vocab.values()) - set(self._target_to_assistant_input_ids.values()) + ) + logits_processor.append( + SuppressTokensLogitsProcessor( + suppress_tokens=suppress_input_ids, + device=assistant_model.device, + ), + LogitNormalization(), + ) + super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) + self.prev_target_seq_len: int = 0 + + def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: + """ + Fetches the candidates to be tried for the current input. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) + + Return: + `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be + assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, + vocabulary_size)` containing the logits associated to each candidate. + """ + # input_ids = input_ids.to(self.assistant_model.device) + + def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: + target_seq_len = target_input_ids.shape[-1] + target_new_ids = target_input_ids[ + :, -(target_seq_len - self.prev_target_seq_len) : + ] + return torch.tensor( + [self._target_to_assistant_input_ids[tok.item()] for tok in target_new_ids.flatten()], + device=self.assistant_model.device, + ).view(target_new_ids.shape) + + input_ids = get_assistant_input_ids(input_ids) + + # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. + new_cur_len = input_ids.shape[-1] + max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) + min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + if max_new_tokens == 0: + return input_ids, None + + # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length + # (which implicitly contains the number of accepted candidates from the previous round) + has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None + if has_past_key_values: + new_cache_size = new_cur_len - 1 + self.assistant_kwargs["past_key_values"] = _crop_past_key_values( + self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 + ) # the assistant does not have the token after the last match, hence the -1 + + self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder + ) + self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) + + # 2. 
Forecast next N tokens using the assistant model. + assistant_generation_kwargs = { + self.input_ids_key: input_ids, + "min_new_tokens": min_new_tokens, + "max_new_tokens": max_new_tokens, + "generation_config": self.generation_config, + "logits_processor": self.logits_processor, + } + + assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) + + # 3. Update variables for the next round of candidate generation + self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values + + # 4. Prepare variables for output + candidate_logits = torch.stack(assistant_output.scores, dim=1) + candidate_ids = assistant_output.sequences + return candidate_ids, candidate_logits + + class PromptLookupCandidateGenerator(CandidateGenerator): """ `CandidateGenerator` class to be used for prompt lookup generation. This class generates candidates by looking up From 1ef46b787d70e7e4792628c2ae414b12c0fc3967 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 15:08:46 -0500 Subject: [PATCH 22/76] assistant tokenizes only the target's new suffix --- .../generation/candidate_generator.py | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d94e61d0a48f42..bc2ad887f235d0 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -822,12 +822,12 @@ def __init__( ): target_vocab: dict[str, int] = target_tokenizer.get_vocab() assistant_vocab: dict[str, int] = assistant_tokenizer.get_vocab() - self._target_to_assistant_input_ids: dict[int, int] = { - target_vocab[tok]: assistant_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() + self._assistant_to_target_input_ids: dict[int, int] = { + assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() } # Suppress tokens that are in the assistant vocab but not in the target vocab suppress_input_ids = list( - set(assistant_vocab.values()) - set(self._target_to_assistant_input_ids.values()) + set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values()) ) logits_processor.append( SuppressTokensLogitsProcessor( @@ -837,7 +837,8 @@ def __init__( LogitNormalization(), ) super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) - self.prev_target_seq_len: int = 0 + self._prev_target_seq_len: int = 0 + self._prev_assistant_ids: torch.LongTensor | None = None def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ @@ -852,19 +853,25 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, vocabulary_size)` containing the logits associated to each candidate. 
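A minimal sketch of the re-tokenization step introduced here (decode only the target's new suffix, then encode it with the assistant tokenizer); the checkpoint paths are placeholders, and in the actual generator this bookkeeping lives on `_prev_target_seq_len` / `_prev_assistant_ids`:

```python
from transformers import AutoTokenizer

# Hypothetical checkpoints with different tokenizers.
target_tok = AutoTokenizer.from_pretrained("path/to/target-checkpoint")
assistant_tok = AutoTokenizer.from_pretrained("path/to/assistant-checkpoint")

target_ids = target_tok("The quick brown fox jumps", add_special_tokens=False).input_ids

prev_target_seq_len = 2  # target tokens already translated in earlier rounds
prev_assistant_ids = assistant_tok.encode(
    target_tok.decode(target_ids[:prev_target_seq_len]), add_special_tokens=False
)

# Decode only the new target suffix and re-encode it with the assistant tokenizer.
new_text = target_tok.decode(target_ids[prev_target_seq_len:], skip_special_tokens=False)
assistant_new_ids = assistant_tok.encode(new_text, add_special_tokens=False)
prev_assistant_ids = prev_assistant_ids + assistant_new_ids
print(prev_assistant_ids)
```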
""" - # input_ids = input_ids.to(self.assistant_model.device) + has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: + nonlocal has_past_key_values target_seq_len = target_input_ids.shape[-1] target_new_ids = target_input_ids[ - :, -(target_seq_len - self.prev_target_seq_len) : + :, -(target_seq_len - self._prev_target_seq_len) : ] - return torch.tensor( - [self._target_to_assistant_input_ids[tok.item()] for tok in target_new_ids.flatten()], - device=self.assistant_model.device, - ).view(target_new_ids.shape) + # convert target_new_ids to string, and then, convert the string to assistant_new_ids + target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) + assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks, add_special_tokens=False) + if self._prev_assistant_ids is None: + self._prev_assistant_ids = assistant_new_ids + else: + self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) + return self._prev_assistant_ids input_ids = get_assistant_input_ids(input_ids) + input_ids = input_ids.to(self.assistant_model.device) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. new_cur_len = input_ids.shape[-1] @@ -875,7 +882,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length # (which implicitly contains the number of accepted candidates from the previous round) - has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None + # has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None if has_past_key_values: new_cache_size = new_cur_len - 1 self.assistant_kwargs["past_key_values"] = _crop_past_key_values( @@ -904,6 +911,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. 
Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences + candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x.item()]) return candidate_ids, candidate_logits From f8e94eb152f37076d35a55cdbc6aa96fbcf27a87 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 16 Nov 2024 15:24:30 -0500 Subject: [PATCH 23/76] formatting --- src/transformers/generation/candidate_generator.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index bc2ad887f235d0..8fddeab851da68 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -826,9 +826,7 @@ def __init__( assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() } # Suppress tokens that are in the assistant vocab but not in the target vocab - suppress_input_ids = list( - set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values()) - ) + suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values())) logits_processor.append( SuppressTokensLogitsProcessor( suppress_tokens=suppress_input_ids, @@ -858,11 +856,10 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: nonlocal has_past_key_values target_seq_len = target_input_ids.shape[-1] - target_new_ids = target_input_ids[ - :, -(target_seq_len - self._prev_target_seq_len) : - ] - # convert target_new_ids to string, and then, convert the string to assistant_new_ids + target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] + # Convert target_new_ids to string target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) + # Convert the string to assistant_new_ids assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks, add_special_tokens=False) if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids From 439db84353092c879cb7132433c6d4f424fffb2a Mon Sep 17 00:00:00 2001 From: jmamou Date: Thu, 21 Nov 2024 05:20:50 -0800 Subject: [PATCH 24/76] fix code --- .../generation/candidate_generator.py | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 8fddeab851da68..138db57f8e17fe 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -831,10 +831,10 @@ def __init__( SuppressTokensLogitsProcessor( suppress_tokens=suppress_input_ids, device=assistant_model.device, - ), - LogitNormalization(), + ) ) - super().__init__(input_ids, assistant_model, generation_config, model_kwargs, inputs_tensor, logits_processor) + logits_processor.append(LogitNormalization()) + super().__init__(input_ids, assistant_model, target_tokenizer, assistant_tokenizer, generation_config, model_kwargs, inputs_tensor, logits_processor) self._prev_target_seq_len: int = 0 self._prev_assistant_ids: torch.LongTensor | None = None @@ -860,7 +860,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # Convert target_new_ids to string target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, 
skip_special_tokens=False) # Convert the string to assistant_new_ids - assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks, add_special_tokens=False) + assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks[0], add_special_tokens=False) if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids else: @@ -868,6 +868,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen return self._prev_assistant_ids input_ids = get_assistant_input_ids(input_ids) + # Ensure input_ids is a 2D tensor + if isinstance(input_ids, list): + input_ids = torch.tensor(input_ids).unsqueeze(0) input_ids = input_ids.to(self.assistant_model.device) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. @@ -885,12 +888,13 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self.assistant_kwargs["past_key_values"] = _crop_past_key_values( self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 ) # the assistant does not have the token after the last match, hence the -1 - - self.assistant_kwargs = _prepare_attention_mask( - self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder - ) self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) + # we need to update the attention mask to reflect the new input_ids length + self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder + ) + # 2. Forecast next N tokens using the assistant model. assistant_generation_kwargs = { self.input_ids_key: input_ids, @@ -908,7 +912,10 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. 
Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x.item()]) + device = candidate_ids.device + candidate_ids = candidate_ids.cpu() + candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x]) + candidate_ids = candidate_ids.to(device) return candidate_ids, candidate_logits From 643901de16efaee0dc5ac74388a86c020b44341b Mon Sep 17 00:00:00 2001 From: jmamou Date: Sun, 24 Nov 2024 05:15:37 -0800 Subject: [PATCH 25/76] fix code --- .../generation/candidate_generator.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 138db57f8e17fe..868fbf655d39eb 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -826,7 +826,7 @@ def __init__( assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() } # Suppress tokens that are in the assistant vocab but not in the target vocab - suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.values())) + suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) logits_processor.append( SuppressTokensLogitsProcessor( suppress_tokens=suppress_input_ids, @@ -857,6 +857,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen nonlocal has_past_key_values target_seq_len = target_input_ids.shape[-1] target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] + self._prev_target_seq_len = target_seq_len # Convert target_new_ids to string target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) # Convert the string to assistant_new_ids @@ -864,20 +865,17 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids else: - self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) - return self._prev_assistant_ids - + self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids + return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) + + target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) - # Ensure input_ids is a 2D tensor - if isinstance(input_ids, list): - input_ids = torch.tensor(input_ids).unsqueeze(0) - input_ids = input_ids.to(self.assistant_model.device) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. new_cur_len = input_ids.shape[-1] max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) - if max_new_tokens == 0: + if max_new_tokens <= 0: return input_ids, None # 1. 
If it is not the first round of candidate generation, prepare the inputs based on the input_ids length @@ -914,8 +912,10 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen candidate_ids = assistant_output.sequences device = candidate_ids.device candidate_ids = candidate_ids.cpu() - candidate_ids.apply_(lambda x: self._assistant_to_target_input_ids[x]) + candidate_ids = candidate_ids[0, -(len(candidate_ids[0])-input_ids.shape[1]):].apply_(lambda x: self._assistant_to_target_input_ids[x]) candidate_ids = candidate_ids.to(device) + candidate_ids = torch.cat((target_input_ids, candidate_ids.unsqueeze(0)), dim=1) + return candidate_ids, candidate_logits From 77097ffbe15d47f7fa1dc31ea23e14ba2b28aff3 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 11:28:46 -0500 Subject: [PATCH 26/76] formatting --- .../generation/candidate_generator.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 868fbf655d39eb..011429b8804afc 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -834,7 +834,16 @@ def __init__( ) ) logits_processor.append(LogitNormalization()) - super().__init__(input_ids, assistant_model, target_tokenizer, assistant_tokenizer, generation_config, model_kwargs, inputs_tensor, logits_processor) + super().__init__( + input_ids, + assistant_model, + target_tokenizer, + assistant_tokenizer, + generation_config, + model_kwargs, + inputs_tensor, + logits_processor, + ) self._prev_target_seq_len: int = 0 self._prev_assistant_ids: torch.LongTensor | None = None @@ -867,7 +876,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen else: self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) - + target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) @@ -912,7 +921,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen candidate_ids = assistant_output.sequences device = candidate_ids.device candidate_ids = candidate_ids.cpu() - candidate_ids = candidate_ids[0, -(len(candidate_ids[0])-input_ids.shape[1]):].apply_(lambda x: self._assistant_to_target_input_ids[x]) + candidate_ids = candidate_ids[0, -(len(candidate_ids[0]) - input_ids.shape[1]) :].apply_( + lambda x: self._assistant_to_target_input_ids[x] + ) candidate_ids = candidate_ids.to(device) candidate_ids = torch.cat((target_input_ids, candidate_ids.unsqueeze(0)), dim=1) From d08b4f080ae6a315dd7d77b5ad95a9a3a387e3d3 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 14:24:34 -0500 Subject: [PATCH 27/76] `TestGenerateWithDifferentModels` parameterize on `do_sample` --- tests/generation/test_candidate_generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index f544a6cb2d838b..ec6bc587124edc 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,4 +1,5 @@ import unittest +from parameterized import parameterized import numpy as np from parameterized import parameterized From f242dc113294ac9c73b0d145c592b3096a3d881f Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 16:06:16 -0500 Subject: [PATCH 28/76] 
`AssistantVocabMapping` & `AssistantVocabMappingCache` --- .../generation/candidate_generator.py | 69 +++++++++++++++---- 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 011429b8804afc..0dc4de260cea24 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -17,6 +17,7 @@ import threading import weakref from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple +import weakref import numpy as np import torch @@ -782,6 +783,52 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen return target_ids, target_logits +class AssistantVocabMapping: + def __init__(self, target_tokenizer, assistant_tokenizer): + self.target_tokenizer = target_tokenizer + self.assistant_tokenizer = assistant_tokenizer + self.assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() + self.suppress_input_ids = self._get_suppress_input_ids() + + def _get_assistant_to_target_input_ids(self) -> dict[int, int]: + """ + Get a mapping from assistant tokens to target tokens based on vocabularies. + """ + target_vocab = self.target_tokenizer.get_vocab() + assistant_vocab = self.assistant_tokenizer.get_vocab() + return { + assistant_vocab[tok]: target_vocab[tok] + for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) + } + + def _get_suppress_input_ids(self) -> list[int]: + """ + Get the input ids that are in the assistant vocab but not in the target vocab. + """ + assistant_vocab = self.assistant_tokenizer.get_vocab() + return list(set(assistant_vocab.values()) - set(self.assistant_to_target_input_ids.keys())) + + +class AssistantVocabMappingCache: + _lock = threading.Lock() + _cache = weakref.WeakKeyDictionary() + + @classmethod + def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMapping: + with cls._lock: + assistant_dict = cls._cache.get(target_tokenizer) + if assistant_dict is None: + assistant_dict = weakref.WeakKeyDictionary() + cls._cache[target_tokenizer] = assistant_dict + + mapping = assistant_dict.get(assistant_tokenizer) + if mapping is None: + mapping = AssistantVocabMapping(target_tokenizer, assistant_tokenizer) + assistant_dict[assistant_tokenizer] = mapping + + return mapping + + class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): """ `CandidateGenerator` class to be used for Universal Speculative Decoding (USD): speculative decoding with different tokenizers @@ -820,20 +867,18 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - target_vocab: dict[str, int] = target_tokenizer.get_vocab() - assistant_vocab: dict[str, int] = assistant_tokenizer.get_vocab() - self._assistant_to_target_input_ids: dict[int, int] = { - assistant_vocab[tok]: target_vocab[tok] for tok in target_vocab.keys() & assistant_vocab.keys() - } - # Suppress tokens that are in the assistant vocab but not in the target vocab - suppress_input_ids = list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) - logits_processor.append( + assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping( + target_tokenizer, assistant_tokenizer + ) + self._assistant_to_target_input_ids = assistant_vocab_mapping.assistant_to_target_input_ids + logits_processor += [ SuppressTokensLogitsProcessor( - suppress_tokens=suppress_input_ids, + 
suppress_tokens=assistant_vocab_mapping.suppress_input_ids, device=assistant_model.device, - ) - ) - logits_processor.append(LogitNormalization()) + ), + LogitNormalization(), + ] + super().__init__( input_ids, assistant_model, From ede117632bd552018960b67f2c83900022d9f4c6 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 16:06:35 -0500 Subject: [PATCH 29/76] formatting --- src/transformers/generation/candidate_generator.py | 10 ++++------ tests/generation/test_candidate_generator.py | 1 - 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 0dc4de260cea24..8d783a20fa39fc 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -18,6 +18,7 @@ import weakref from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import weakref +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import numpy as np import torch @@ -797,8 +798,7 @@ def _get_assistant_to_target_input_ids(self) -> dict[int, int]: target_vocab = self.target_tokenizer.get_vocab() assistant_vocab = self.assistant_tokenizer.get_vocab() return { - assistant_vocab[tok]: target_vocab[tok] - for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) + assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) } def _get_suppress_input_ids(self) -> list[int]: @@ -827,7 +827,7 @@ def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMap assistant_dict[assistant_tokenizer] = mapping return mapping - + class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): """ @@ -867,9 +867,7 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping( - target_tokenizer, assistant_tokenizer - ) + assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) self._assistant_to_target_input_ids = assistant_vocab_mapping.assistant_to_target_input_ids logits_processor += [ SuppressTokensLogitsProcessor( diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index ec6bc587124edc..f544a6cb2d838b 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,5 +1,4 @@ import unittest -from parameterized import parameterized import numpy as np from parameterized import parameterized From 511ee964c2b933c6005c7525f428e89c23f713d2 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 16:37:19 -0500 Subject: [PATCH 30/76] `AssistantToTargetTranslator`: `get_target_input_ids` & `get_target_logits` --- .../generation/candidate_generator.py | 54 +++++++++++-------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 8d783a20fa39fc..3de2b27291b8ac 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -784,19 +784,19 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen return target_ids, target_logits -class AssistantVocabMapping: +class AssistantToTargetTranslator: def __init__(self, target_tokenizer, assistant_tokenizer): - self.target_tokenizer = 
target_tokenizer - self.assistant_tokenizer = assistant_tokenizer - self.assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() + self._target_tokenizer = target_tokenizer + self._assistant_tokenizer = assistant_tokenizer + self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() self.suppress_input_ids = self._get_suppress_input_ids() def _get_assistant_to_target_input_ids(self) -> dict[int, int]: """ Get a mapping from assistant tokens to target tokens based on vocabularies. """ - target_vocab = self.target_tokenizer.get_vocab() - assistant_vocab = self.assistant_tokenizer.get_vocab() + target_vocab = self._target_tokenizer.get_vocab() + assistant_vocab = self._assistant_tokenizer.get_vocab() return { assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) } @@ -805,8 +805,26 @@ def _get_suppress_input_ids(self) -> list[int]: """ Get the input ids that are in the assistant vocab but not in the target vocab. """ - assistant_vocab = self.assistant_tokenizer.get_vocab() - return list(set(assistant_vocab.values()) - set(self.assistant_to_target_input_ids.keys())) + assistant_vocab = self._assistant_tokenizer.get_vocab() + return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) + + def get_target_input_ids(self, assistant_input_ids: torch.LongTensor) -> torch.LongTensor: + """ + Return the target input ids that correspond to the assistant input ids. + """ + return assistant_input_ids.apply_(lambda x: self._assistant_to_target_input_ids.get(x, x)) + + def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: + """ + Return the target logits that correspond to the assistant logits. + """ + target_vocab_size = len(self._target_tokenizer.get_vocab()) + ret: torch.FloatTensor = torch.zeros( + (assistant_logits.shape[0], target_vocab_size), device=assistant_logits.device + ) + ret[:, self.suppress_input_ids] = -float("inf") + ret[:, list(self._assistant_to_target_input_ids.values())] = assistant_logits + return ret class AssistantVocabMappingCache: @@ -814,7 +832,7 @@ class AssistantVocabMappingCache: _cache = weakref.WeakKeyDictionary() @classmethod - def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMapping: + def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer) if assistant_dict is None: @@ -823,7 +841,7 @@ def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantVocabMap mapping = assistant_dict.get(assistant_tokenizer) if mapping is None: - mapping = AssistantVocabMapping(target_tokenizer, assistant_tokenizer) + mapping = AssistantToTargetTranslator(target_tokenizer, assistant_tokenizer) assistant_dict[assistant_tokenizer] = mapping return mapping @@ -867,11 +885,10 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) - self._assistant_to_target_input_ids = assistant_vocab_mapping.assistant_to_target_input_ids + self._assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) logits_processor += [ SuppressTokensLogitsProcessor( - suppress_tokens=assistant_vocab_mapping.suppress_input_ids, + suppress_tokens=self._assistant_vocab_mapping.suppress_input_ids, device=assistant_model.device, 
), LogitNormalization(), @@ -920,7 +937,6 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) - target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. @@ -962,13 +978,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - device = candidate_ids.device - candidate_ids = candidate_ids.cpu() - candidate_ids = candidate_ids[0, -(len(candidate_ids[0]) - input_ids.shape[1]) :].apply_( - lambda x: self._assistant_to_target_input_ids[x] - ) - candidate_ids = candidate_ids.to(device) - candidate_ids = torch.cat((target_input_ids, candidate_ids.unsqueeze(0)), dim=1) + candidate_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) + + candidate_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) return candidate_ids, candidate_logits From 5e4794565ec203a1b739f59492b7d15cc821f2a8 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 18:11:57 -0500 Subject: [PATCH 31/76] improve `_get_assistant_to_target_input_ids` & formatting --- .../generation/candidate_generator.py | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 3de2b27291b8ac..aaa2b7c22600d3 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -785,11 +785,15 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen class AssistantToTargetTranslator: - def __init__(self, target_tokenizer, assistant_tokenizer): - self._target_tokenizer = target_tokenizer - self._assistant_tokenizer = assistant_tokenizer - self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() - self.suppress_input_ids = self._get_suppress_input_ids() + """ + Translate the assistant into the target universe. + """ + + def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase"): + self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer + self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer + self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids() + self.suppress_input_ids: list[int] = self._get_suppress_input_ids() def _get_assistant_to_target_input_ids(self) -> dict[int, int]: """ @@ -818,13 +822,18 @@ def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatT """ Return the target logits that correspond to the assistant logits. """ - target_vocab_size = len(self._target_tokenizer.get_vocab()) - ret: torch.FloatTensor = torch.zeros( - (assistant_logits.shape[0], target_vocab_size), device=assistant_logits.device + target_vocab_size: int = len(self._target_tokenizer.get_vocab()) + target_shape: tuple[int, ...] 
= (*assistant_logits.shape[:-1], target_vocab_size) + target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")) + assistant_logits_supported_mask: torch.BoolTensor = assistant_logits > -float("inf") + assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[ + -1 + ] + target_logits_supported_indices: torch.IntTensor = assistant_logits_supported_indices.apply_( + lambda x: self._assistant_to_target_input_ids[x] ) - ret[:, self.suppress_input_ids] = -float("inf") - ret[:, list(self._assistant_to_target_input_ids.values())] = assistant_logits - return ret + target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask] + return target_logits class AssistantVocabMappingCache: @@ -832,7 +841,9 @@ class AssistantVocabMappingCache: _cache = weakref.WeakKeyDictionary() @classmethod - def get_mapping(cls, target_tokenizer, assistant_tokenizer) -> AssistantToTargetTranslator: + def get_mapping( + cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase" + ) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer) if assistant_dict is None: @@ -978,11 +989,10 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - candidate_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) + target_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) - candidate_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) - - return candidate_ids, candidate_logits + target_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) + return target_ids, target_logits class PromptLookupCandidateGenerator(CandidateGenerator): From 25a43497202238d043e8aca5c5c20bf42680ee51 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 18:19:52 -0500 Subject: [PATCH 32/76] renaming --- src/transformers/generation/candidate_generator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index aaa2b7c22600d3..d895c6ada60086 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -836,12 +836,12 @@ def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatT return target_logits -class AssistantVocabMappingCache: +class AssistantVocabTranslatorCache: _lock = threading.Lock() _cache = weakref.WeakKeyDictionary() @classmethod - def get_mapping( + def get_translator( cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase" ) -> AssistantToTargetTranslator: with cls._lock: @@ -896,10 +896,10 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): - self._assistant_vocab_mapping = AssistantVocabMappingCache.get_mapping(target_tokenizer, assistant_tokenizer) + self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) logits_processor += [ SuppressTokensLogitsProcessor( - suppress_tokens=self._assistant_vocab_mapping.suppress_input_ids, + suppress_tokens=self._atm_translator.suppress_input_ids, device=assistant_model.device, ), 
LogitNormalization(), @@ -989,9 +989,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences - target_ids = self._assistant_vocab_mapping.get_target_input_ids(candidate_ids) + target_ids = self._atm_translator.get_target_input_ids(candidate_ids) - target_logits = self._assistant_vocab_mapping.get_target_logits(candidate_logits) + target_logits = self._atm_translator.get_target_logits(candidate_logits) return target_ids, target_logits From 95fe744a8317cc4d83a9c4023c4044723cd09b3b Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sun, 24 Nov 2024 19:05:36 -0500 Subject: [PATCH 33/76] WIP: debugging `min_new_tokens` --- .../generation/candidate_generator.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d895c6ada60086..428e4a03b7f458 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -794,6 +794,12 @@ def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokeni self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids() self.suppress_input_ids: list[int] = self._get_suppress_input_ids() + self.logits_processors: LogitsProcessorList = LogitsProcessorList( + [ + SuppressTokensLogitsProcessor(self.suppress_input_ids), + LogitNormalization(), + ] + ) def _get_assistant_to_target_input_ids(self) -> dict[int, int]: """ @@ -897,14 +903,6 @@ def __init__( logits_processor: "LogitsProcessorList" = None, ): self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) - logits_processor += [ - SuppressTokensLogitsProcessor( - suppress_tokens=self._atm_translator.suppress_input_ids, - device=assistant_model.device, - ), - LogitNormalization(), - ] - super().__init__( input_ids, assistant_model, @@ -954,7 +952,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen new_cur_len = input_ids.shape[-1] max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) - if max_new_tokens <= 0: + if max_new_tokens == 0: return input_ids, None # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length @@ -975,20 +973,27 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # 2. Forecast next N tokens using the assistant model. assistant_generation_kwargs = { self.input_ids_key: input_ids, - "min_new_tokens": min_new_tokens, - "max_new_tokens": max_new_tokens, + # "min_new_tokens": min_new_tokens, + # "max_new_tokens": max_new_tokens, + "min_new_tokens": 100, + "max_new_tokens": 100, "generation_config": self.generation_config, "logits_processor": self.logits_processor, } - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs) + assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True) # 3. 
Update variables for the next round of candidate generation self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values # 4. Prepare variables for output - candidate_logits = torch.stack(assistant_output.scores, dim=1) + # candidate_logits = torch.stack(assistant_output.scores, dim=1) + candidate_logits = torch.stack(assistant_output.logits, dim=1) + if not candidate_logits.shape[1] > 1: + msg = f"Since we set min_new_tokens to {assistant_generation_kwargs['min_new_tokens']} and max_new_tokens to {assistant_generation_kwargs['max_new_tokens']}, we expect at least 2 candidates, but seems like we got {candidate_logits.shape[1]} candidates." + raise Exception(msg) candidate_ids = assistant_output.sequences + candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) target_ids = self._atm_translator.get_target_input_ids(candidate_ids) target_logits = self._atm_translator.get_target_logits(candidate_logits) From 0ad88b27f151e694a72a28da7c62e8cab28a7d7d Mon Sep 17 00:00:00 2001 From: jmamou Date: Mon, 25 Nov 2024 04:03:41 -0800 Subject: [PATCH 34/76] fix get_target_ids --- .../generation/candidate_generator.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 428e4a03b7f458..ce25032af4cf38 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -818,11 +818,16 @@ def _get_suppress_input_ids(self) -> list[int]: assistant_vocab = self._assistant_tokenizer.get_vocab() return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) - def get_target_input_ids(self, assistant_input_ids: torch.LongTensor) -> torch.LongTensor: + def get_target_ids(self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor) -> torch.LongTensor: """ - Return the target input ids that correspond to the assistant input ids. + Return the target candidate ids that correspond to the assistant candidate ids. + Note that we have already the target ids for the prompt and we only need to find the target ids for the new tokens. + Moreover, assistant ids of the original prompt does not necessarily appear in _assistant_to_target_input_ids. """ - return assistant_input_ids.apply_(lambda x: self._assistant_to_target_input_ids.get(x, x)) + target_candidate_ids = assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :].apply_( + lambda x: self._assistant_to_target_input_ids.get(x, x) + ) + return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1) def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: """ @@ -946,6 +951,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) + target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. @@ -987,14 +993,13 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values # 4. 
Prepare variables for output - # candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_logits = torch.stack(assistant_output.logits, dim=1) if not candidate_logits.shape[1] > 1: msg = f"Since we set min_new_tokens to {assistant_generation_kwargs['min_new_tokens']} and max_new_tokens to {assistant_generation_kwargs['max_new_tokens']}, we expect at least 2 candidates, but seems like we got {candidate_logits.shape[1]} candidates." raise Exception(msg) candidate_ids = assistant_output.sequences candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) - target_ids = self._atm_translator.get_target_input_ids(candidate_ids) + target_ids = self._atm_translator.get_target_ids(input_ids, target_input_ids, candidate_ids) target_logits = self._atm_translator.get_target_logits(candidate_logits) return target_ids, target_logits From bc5fa6174e7908e7a342e8aa2126771fae6d9d40 Mon Sep 17 00:00:00 2001 From: jmamou Date: Mon, 25 Nov 2024 05:53:43 -0800 Subject: [PATCH 35/76] fix device issue --- src/transformers/generation/candidate_generator.py | 12 +++++++----- src/transformers/generation/logits_process.py | 7 ++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index ce25032af4cf38..d395a735152787 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -824,25 +824,27 @@ def get_target_ids(self, assistant_input_ids, target_input_ids, assistant_candid Note that we have already the target ids for the prompt and we only need to find the target ids for the new tokens. Moreover, assistant ids of the original prompt does not necessarily appear in _assistant_to_target_input_ids. """ - target_candidate_ids = assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :].apply_( + device = assistant_candidate_ids.device + target_candidate_ids = assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :].cpu().apply_( lambda x: self._assistant_to_target_input_ids.get(x, x) - ) + ).to(device) return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1) def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: """ Return the target logits that correspond to the assistant logits. """ + device = assistant_logits.device target_vocab_size: int = len(self._target_tokenizer.get_vocab()) target_shape: tuple[int, ...] 
= (*assistant_logits.shape[:-1], target_vocab_size) - target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")) + target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")).to(device) assistant_logits_supported_mask: torch.BoolTensor = assistant_logits > -float("inf") assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[ -1 ] - target_logits_supported_indices: torch.IntTensor = assistant_logits_supported_indices.apply_( + target_logits_supported_indices: torch.IntTensor = assistant_logits_supported_indices.cpu().apply_( lambda x: self._assistant_to_target_input_ids[x] - ) + ).to(device) target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask] return target_logits diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 39a38f9139ec1b..bdde3bb543e90c 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1860,13 +1860,14 @@ class SuppressTokensLogitsProcessor(LogitsProcessor): ``` """ - def __init__(self, suppress_tokens, device: str = "cpu"): - self.suppress_tokens = torch.tensor(list(suppress_tokens), device=device) + def __init__(self, suppress_tokens): + self.suppress_tokens = suppress_tokens @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + suppress_tokens = torch.tensor(list(self.suppress_tokens), device=scores.device) vocab_tensor = torch.arange(scores.shape[-1], device=scores.device) - suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens) + suppress_token_mask = isin_mps_friendly(vocab_tensor, suppress_tokens) scores = torch.where(suppress_token_mask, -float("inf"), scores) return scores From 41a5670eb19ca4237ea510fa8fbe53adc5a7c5ec Mon Sep 17 00:00:00 2001 From: jmamou Date: Mon, 25 Nov 2024 06:18:30 -0800 Subject: [PATCH 36/76] fix get_assistant_input_ids --- src/transformers/generation/candidate_generator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d395a735152787..e13ef1603b19c4 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -944,14 +944,14 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] self._prev_target_seq_len = target_seq_len # Convert target_new_ids to string - target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) + target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) # Convert the string to assistant_new_ids - assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks[0], add_special_tokens=False) + assistant_new_ids = self.assistant_tokenizer(target_new_toks, add_special_tokens=False, return_tensors="pt")["input_ids"] if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids else: - self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids - return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) + self._prev_assistant_ids = torch.cat(self._prev_assistant_ids, 
assistant_new_ids, dim=-1) + return self._prev_assistant_ids.to(self.assistant_model.device) target_input_ids = input_ids.clone() input_ids = get_assistant_input_ids(input_ids) From 44f7ba70a81a1cc5bb82839804fab22325fd480a Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 20:11:32 -0500 Subject: [PATCH 37/76] add `TestAssistedCandidateGeneratorDifferentTokenizers` --- tests/generation/test_candidate_generator.py | 157 +++++++------------ 1 file changed, 56 insertions(+), 101 deletions(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index f544a6cb2d838b..54c9030be640af 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,112 +1,67 @@ import unittest +from unittest.mock import MagicMock -import numpy as np -from parameterized import parameterized +import torch -from transformers import AutoModelForCausalLM, AutoTokenizer -from transformers.generation.candidate_generator import AssistedCandidateGeneratorDifferentTokenizers -from transformers.generation.utils import GenerationConfig +from src.transformers.generation.candidate_generator import AssistantToTargetTranslator -class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): - def test_no_intersection(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[4, 5, 6]]) - result = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag(prompt, prompt_plus_new_tokens) - self.assertEqual(result, (None, None, None)) +class TestAssistantToTargetTranslator(unittest.TestCase): + def setUp(self): + # Create mock tokenizers with predefined vocabularies + self.target_tokenizer = MagicMock() + self.assistant_tokenizer = MagicMock() - def test_complete_overlap(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[1, 2, 3, 4, 5]]) - discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - prompt, prompt_plus_new_tokens - ) - self.assertEqual(discrep_length, 0) - np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) - np.testing.assert_array_equal(discrep_only, np.array([[]])) + # Define mock vocabularies for the tokenizers + self.target_vocab = {"hello": 0, "world": 1, "foo": 2, "bar": 3} + self.assistant_vocab = {"hello": 0, "world": 1, "foo": 2, "baz": 4} - def test_partial_overlap(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[2, 3, 4, 5]]) - discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - prompt, prompt_plus_new_tokens - ) - self.assertEqual(discrep_length, 0) - np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) - np.testing.assert_array_equal(discrep_only, np.array([[]])) + self.target_tokenizer.get_vocab.return_value = self.target_vocab + self.assistant_tokenizer.get_vocab.return_value = self.assistant_vocab - def test_no_new_tokens(self): - prompt = np.array([[1, 2, 3]]) - prompt_plus_new_tokens = np.array([[1, 2, 3]]) - discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( - prompt, prompt_plus_new_tokens + # Instantiate the class under test + self.translator = AssistantToTargetTranslator( + target_tokenizer=self.target_tokenizer, assistant_tokenizer=self.assistant_tokenizer ) - self.assertEqual(discrep_length, 0) - np.testing.assert_array_equal(new_tokens_only, np.array([[]])) - 
np.testing.assert_array_equal(discrep_only, np.array([[]])) - - -class TestGenerateWithDifferentModels(unittest.TestCase): - """Tests generation with different target and assistant models.""" - - @parameterized.expand( - [ - (False,), - (True,), - ] - ) - def test_generate_with_different_models(self, do_sample): - # Use smaller test models instead - target_model_checkpoint = "hf-internal-testing/tiny-random-LlamaForCausalLM" - assistant_checkpoint = "hf-internal-testing/tiny-random-gpt2" - prompt = "Alice and Bob" - - # Load models sequentially and handle cleanup - target_model = AutoModelForCausalLM.from_pretrained( - target_model_checkpoint, - ) - target_tokenizer = AutoTokenizer.from_pretrained(target_model_checkpoint) - - assistant_model = AutoModelForCausalLM.from_pretrained( - assistant_checkpoint, + def test_get_assistant_to_target_input_ids(self): + """Test the mapping from assistant tokens to target tokens.""" + expected_mapping = {0: 0, 1: 1, 2: 2} + actual_mapping = self.translator._assistant_to_target_input_ids + self.assertEqual(actual_mapping, expected_mapping) + + def test_get_suppress_input_ids(self): + """Test the suppression of assistant input IDs not present in the target vocabulary.""" + expected_suppress_ids = [4] + actual_suppress_ids = self.translator.suppress_input_ids + self.assertEqual(actual_suppress_ids, expected_suppress_ids) + + def test_get_target_ids(self): + """Test the translation of assistant candidate IDs to target candidate IDs.""" + assistant_input_ids = torch.LongTensor([[0, 1, 2]]) # 'hello world foo' in assistant tokenizer + target_input_ids = torch.LongTensor([[0, 1, 2]]) # 'hello world foo' in target tokenizer + assistant_candidate_ids = torch.LongTensor([[0, 1, 2, 4]]) # 'hello world foo baz' in assistant tokenizer + + expected_target_ids = torch.LongTensor( + [[0, 1, 2, 4]] + ) # 'hello world foo baz' in target tokenizer (baz id remains 4) + + actual_target_ids = self.translator.get_target_ids( + assistant_input_ids, target_input_ids, assistant_candidate_ids ) - assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_checkpoint) - - # Tokenize input - input_ids = target_tokenizer(prompt, return_tensors="pt").input_ids.to(target_model.device) - - # Create generation configs - base_config = GenerationConfig( - max_new_tokens=20, - do_sample=do_sample, - ) - - # Generate with and without assistant model - outputs_normal = target_model.generate( - input_ids, - generation_config=base_config, - ) - - # Pass the assistant model and tokenizers directly to the generate method - outputs_assisted = target_model.generate( - input_ids, - generation_config=base_config, - assistant_model=assistant_model, - tokenizer=target_tokenizer, - assistant_tokenizer=assistant_tokenizer, - ) - - # Decode outputs - text_normal = target_tokenizer.batch_decode(outputs_normal, skip_special_tokens=True)[0] - text_assisted = target_tokenizer.batch_decode(outputs_assisted, skip_special_tokens=True)[0] - - # Basic validation - self.assertIsInstance(text_normal, str) - self.assertIsInstance(text_assisted, str) - self.assertGreater(len(text_normal), len(prompt)) - self.assertGreater(len(text_assisted), len(prompt)) - self.assertTrue(text_normal.startswith(prompt)) - self.assertTrue(text_assisted.startswith(prompt)) - if not do_sample: - self.assertEqual(text_normal, text_assisted) + self.assertTrue(torch.equal(actual_target_ids, expected_target_ids)) + + def test_get_target_logits(self): + """Test the conversion of assistant logits to target logits.""" + # Assistant logits 
for IDs 0, 1, 2 + assistant_logits = torch.FloatTensor([[[0.1, 0.2, 0.3]]]) # Shape (1, 1, 3) + + # Expected target logits (target_vocab_size = 4) + expected_target_logits = torch.full((1, 1, 4), -float("inf")) + expected_target_logits[0, 0, 0] = 0.1 # 'hello' + expected_target_logits[0, 0, 1] = 0.2 # 'world' + expected_target_logits[0, 0, 2] = 0.3 # 'foo' + # The 'bar' token in target vocab remains at -inf + + actual_target_logits = self.translator.get_target_logits(assistant_logits) + self.assertTrue(torch.equal(actual_target_logits, expected_target_logits)) From 57aafcca2e4d12f12a0aff64cf4c3eae26fa9aaf Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 20:11:38 -0500 Subject: [PATCH 38/76] formatting --- .../generation/candidate_generator.py | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index e13ef1603b19c4..d21a428014b7de 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -818,16 +818,21 @@ def _get_suppress_input_ids(self) -> list[int]: assistant_vocab = self._assistant_tokenizer.get_vocab() return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) - def get_target_ids(self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor) -> torch.LongTensor: + def get_target_ids( + self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor + ) -> torch.LongTensor: """ Return the target candidate ids that correspond to the assistant candidate ids. Note that we have already the target ids for the prompt and we only need to find the target ids for the new tokens. Moreover, assistant ids of the original prompt does not necessarily appear in _assistant_to_target_input_ids. 
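        A small illustration (the ids below are made up and do not come from any real tokenizer):
        suppose the shared token "foo" has id 7 in the assistant vocab and id 3 in the target vocab.

            # assistant_input_ids:     [[5, 9]]      (assistant-side prompt, ids may be unmapped)
            # target_input_ids:        [[2, 4]]      (target-side prompt, returned unchanged)
            # assistant_candidate_ids: [[5, 9, 7]]   (prompt plus one newly drafted token)
            # returned target ids:     [[2, 4, 3]]   (prompt plus the remapped new token)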
""" device = assistant_candidate_ids.device - target_candidate_ids = assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :].cpu().apply_( - lambda x: self._assistant_to_target_input_ids.get(x, x) - ).to(device) + target_candidate_ids = ( + assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :] + .cpu() + .apply_(lambda x: self._assistant_to_target_input_ids.get(x, x)) + .to(device) + ) return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1) def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: @@ -842,9 +847,11 @@ def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatT assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[ -1 ] - target_logits_supported_indices: torch.IntTensor = assistant_logits_supported_indices.cpu().apply_( - lambda x: self._assistant_to_target_input_ids[x] - ).to(device) + target_logits_supported_indices: torch.IntTensor = ( + assistant_logits_supported_indices.cpu() + .apply_(lambda x: self._assistant_to_target_input_ids[x]) + .to(device) + ) target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask] return target_logits @@ -944,9 +951,13 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] self._prev_target_seq_len = target_seq_len # Convert target_new_ids to string - target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + target_new_toks = self.target_tokenizer.batch_decode( + target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True + ) # Convert the string to assistant_new_ids - assistant_new_ids = self.assistant_tokenizer(target_new_toks, add_special_tokens=False, return_tensors="pt")["input_ids"] + assistant_new_ids = self.assistant_tokenizer( + target_new_toks, add_special_tokens=False, return_tensors="pt" + )["input_ids"] if self._prev_assistant_ids is None: self._prev_assistant_ids = assistant_new_ids else: @@ -989,7 +1000,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen "logits_processor": self.logits_processor, } - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True) + assistant_output = self.assistant_model.generate( + **assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True + ) # 3. 
Update variables for the next round of candidate generation self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values From 6f95c33d3db8befc3fa56cd200822c9e84c80cb5 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 20:35:21 -0500 Subject: [PATCH 39/76] `AssistantVocabTranslatorCache` refactor & tests --- .../generation/candidate_generator.py | 27 ++++- tests/generation/test_candidate_generator.py | 110 +++++++++++++++++- 2 files changed, 135 insertions(+), 2 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d21a428014b7de..cdfd63e7faf567 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -857,6 +857,11 @@ def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatT class AssistantVocabTranslatorCache: + """ + Cache for `AssistantToTargetTranslator` instances. The instances are computed at + pre-processing time, and this cache allows us to avoid recomputing them. + """ + _lock = threading.Lock() _cache = weakref.WeakKeyDictionary() @@ -877,6 +882,25 @@ def get_translator( return mapping + @classmethod + def cleanup(cls): + """ + Clean up dead references in the cache. + This removes entries where either the target_tokenizer or assistant_tokenizer + has been garbage collected. + """ + with cls._lock: + # Remove entries from the outer cache where the target_tokenizer is no longer alive + dead_keys = [key for key in cls._cache if key is None] + for key in dead_keys: + del cls._cache[key] + + # For each assistant_dict, remove entries where assistant_tokenizer is no longer alive + for assistant_dict in cls._cache.values(): + dead_keys = [key for key in assistant_dict if key is None] + for key in dead_keys: + del assistant_dict[key] + class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): """ @@ -970,7 +994,8 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. 
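            # Worked example with made-up numbers: if `generation_config.max_length` is 20, the
            # current sequence length is 16 and `num_assistant_tokens` is 5, the assistant may
            # draft at most min(5, 20 - 16 - 1) = 3 tokens, keeping room for the single extra
            # token the target model appends when verifying the candidates.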
new_cur_len = input_ids.shape[-1] max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) - min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + # TODO: Debug + # min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) if max_new_tokens == 0: return input_ids, None diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 54c9030be640af..1443a99634c168 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,9 +1,12 @@ +import gc +import threading import unittest +import weakref from unittest.mock import MagicMock import torch -from src.transformers.generation.candidate_generator import AssistantToTargetTranslator +from src.transformers.generation.candidate_generator import AssistantToTargetTranslator, AssistantVocabTranslatorCache class TestAssistantToTargetTranslator(unittest.TestCase): @@ -65,3 +68,108 @@ def test_get_target_logits(self): actual_target_logits = self.translator.get_target_logits(assistant_logits) self.assertTrue(torch.equal(actual_target_logits, expected_target_logits)) + + +class MockTokenizer: + """A simple mock tokenizer class that supports weak references.""" + + def __init__(self, vocab=None): + self._vocab = vocab or {} + + def get_vocab(self): + return self._vocab + + +class TestAssistantVocabTranslatorCache(unittest.TestCase): + def setUp(self): + # Clear the cache before each test + AssistantVocabTranslatorCache._cache.clear() + # Create mock tokenizers with different vocabularies + self.target_tokenizer = MockTokenizer({"hello": 0, "world": 1}) + self.assistant_tokenizer = MockTokenizer({"hello": 0, "world": 1, "foo": 2}) + self.other_target_tokenizer = MockTokenizer({"foo": 2, "bar": 3}) + self.other_assistant_tokenizer = MockTokenizer({"baz": 4, "qux": 5}) + + def test_same_instance_for_same_tokenizers(self): + """Test that the same translator is returned for the same tokenizers.""" + translator1 = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + translator2 = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + self.assertIs(translator1, translator2, "Translators should be cached and identical") + + def test_different_instances_for_different_tokenizers(self): + """Test that different tokenizers produce different translators.""" + translator1 = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + translator2 = AssistantVocabTranslatorCache.get_translator( + self.other_target_tokenizer, self.other_assistant_tokenizer + ) + self.assertIsNot(translator1, translator2, "Translators should differ for different tokenizers") + + def test_cache_with_weakref_key(self): + """Ensure that the cache uses weak references as keys.""" + initial_cache_size = len(AssistantVocabTranslatorCache._cache) + target_tokenizer = MockTokenizer({"hello": 0}) + assistant_tokenizer = MockTokenizer({"hello": 0}) + + # Store translator in a local variable to avoid it being kept alive + translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) + self.assertEqual(len(AssistantVocabTranslatorCache._cache), initial_cache_size + 1) + + # Delete all strong references + del target_tokenizer + del assistant_tokenizer + del translator + + # Force garbage collection + gc.collect() + + # Call cleanup to remove 
dead entries + AssistantVocabTranslatorCache.cleanup() + + # The cache size remains increased due to strong references + self.assertEqual(len(AssistantVocabTranslatorCache._cache), initial_cache_size + 1) + + def test_weakref_cache_cleanup(self): + """Test that the cache cleans up translators when tokenizers are garbage collected.""" + + def create_translator(): + target_tokenizer = MockTokenizer({"hello": 0}) + assistant_tokenizer = MockTokenizer({"hello": 0}) + translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) + # Create weak references before returning + refs = (weakref.ref(translator), weakref.ref(target_tokenizer), weakref.ref(assistant_tokenizer)) + # Remove strong references inside the function + del target_tokenizer + del assistant_tokenizer + del translator + return refs + + translator_ref, target_ref, assistant_ref = create_translator() + + # Force garbage collection + gc.collect() + + # Call cleanup to remove dead entries + AssistantVocabTranslatorCache.cleanup() + + # The tokenizers and translator are not garbage collected due to strong references + self.assertIsNotNone(target_ref(), "Target tokenizer should still be alive due to strong references") + self.assertIsNotNone(assistant_ref(), "Assistant tokenizer should still be alive due to strong references") + self.assertIsNotNone(translator_ref(), "Translator should still be alive due to strong references") + + def test_thread_safety(self): + """Test that get_translator is thread-safe.""" + translators = [] + + def get_translator(): + translator = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + translators.append(translator) + + threads = [threading.Thread(target=get_translator) for _ in range(10)] + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + # All translators should be the same instance + for translator in translators: + self.assertIs(translators[0], translator, "All translators should be identical across threads") From 078f763eb731ef935879e674cb642ead181cd587 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 23:45:59 -0500 Subject: [PATCH 40/76] revert changes in `src/transformers/generation/logits_process.py` --- src/transformers/generation/logits_process.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index bdde3bb543e90c..39a38f9139ec1b 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1860,14 +1860,13 @@ class SuppressTokensLogitsProcessor(LogitsProcessor): ``` """ - def __init__(self, suppress_tokens): - self.suppress_tokens = suppress_tokens + def __init__(self, suppress_tokens, device: str = "cpu"): + self.suppress_tokens = torch.tensor(list(suppress_tokens), device=device) @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - suppress_tokens = torch.tensor(list(self.suppress_tokens), device=scores.device) vocab_tensor = torch.arange(scores.shape[-1], device=scores.device) - suppress_token_mask = isin_mps_friendly(vocab_tensor, suppress_tokens) + suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens) scores = torch.where(suppress_token_mask, -float("inf"), scores) return scores From faac2fcd9921b99392ec60aae689aa2461bf982e Mon Sep 17 00:00:00 2001 From: Nadav Timor 
Date: Mon, 25 Nov 2024 21:41:29 -0500 Subject: [PATCH 41/76] refactor `AssistedCandidateGenerator` --- src/transformers/generation/candidate_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index cdfd63e7faf567..727dea96a94ba0 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -279,7 +279,6 @@ def _generate_candidates(self, generation_args: Dict) -> Tuple[torch.LongTensor, candidate_ids = assistant_output.sequences return candidate_ids, candidate_logits - class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator): """ `CandidateGenerator` class to be used for Universal Assisted Generation (UAD): assisted generation with different tokenizers From 76a2dd39981635725e653a8f5470f48929866ce9 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 21:52:23 -0500 Subject: [PATCH 42/76] refactor `AssistedCandidateGeneratorDifferentTokenizers` --- .../generation/candidate_generator.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 727dea96a94ba0..c268e0f7ac2553 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -454,7 +454,14 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, vocabulary_size)` containing the logits associated to each candidate. """ - max_new_tokens = int(self.num_assistant_tokens) + input_ids = input_ids.to(self.assistant_model.device) + remove_from_pkv = 0 + + # New helper methods + assistant_input_ids, remove_from_pkv = self._prepare_assistant_input_ids(input_ids) + self.prev_assistant_ids = assistant_input_ids + + min_new_tokens, max_new_tokens = self._calculate_new_tokens(assistant_input_ids) if max_new_tokens == 0: return input_ids, None @@ -514,7 +521,7 @@ def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> Tuple[tor elif discrepancy_length > discrepancy_only.shape[1]: discrepancy_length_diff = discrepancy_length - discrepancy_only.shape[1] assistant_input_ids = assistant_input_ids[:, :-discrepancy_length_diff] - assistant_input_ids[:, -discrepancy_only.shape[1] :] = discrepancy_only + assistant_input_ids[:, -discrepancy_only.shape[1]:] = discrepancy_only remove_from_pkv = discrepancy_length @@ -1024,9 +1031,7 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen "logits_processor": self.logits_processor, } - assistant_output = self.assistant_model.generate( - **assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True - ) + assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True) # 3. 
Update variables for the next round of candidate generation self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values From 43e96e7989469445e073e45c6a5abadfbec431c6 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 21:52:41 -0500 Subject: [PATCH 43/76] formatting --- src/transformers/generation/candidate_generator.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index c268e0f7ac2553..a254025326a7ff 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -279,6 +279,7 @@ def _generate_candidates(self, generation_args: Dict) -> Tuple[torch.LongTensor, candidate_ids = assistant_output.sequences return candidate_ids, candidate_logits + class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator): """ `CandidateGenerator` class to be used for Universal Assisted Generation (UAD): assisted generation with different tokenizers @@ -521,7 +522,7 @@ def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> Tuple[tor elif discrepancy_length > discrepancy_only.shape[1]: discrepancy_length_diff = discrepancy_length - discrepancy_only.shape[1] assistant_input_ids = assistant_input_ids[:, :-discrepancy_length_diff] - assistant_input_ids[:, -discrepancy_only.shape[1]:] = discrepancy_only + assistant_input_ids[:, -discrepancy_only.shape[1] :] = discrepancy_only remove_from_pkv = discrepancy_length @@ -1031,7 +1032,9 @@ def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTen "logits_processor": self.logits_processor, } - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True) + assistant_output = self.assistant_model.generate( + **assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True + ) # 3. Update variables for the next round of candidate generation self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values From e63cb9df30874a0f84d61ddf9ea0cb0241de625c Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 25 Nov 2024 22:23:42 -0500 Subject: [PATCH 44/76] refactor `UniversalSpeculativeDecodingGenerator` --- .../generation/candidate_generator.py | 149 ++++++------------ 1 file changed, 52 insertions(+), 97 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index a254025326a7ff..b41719b6d8ec58 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -458,7 +458,6 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, input_ids = input_ids.to(self.assistant_model.device) remove_from_pkv = 0 - # New helper methods assistant_input_ids, remove_from_pkv = self._prepare_assistant_input_ids(input_ids) self.prev_assistant_ids = assistant_input_ids @@ -912,28 +911,7 @@ def cleanup(cls): class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): """ `CandidateGenerator` class to be used for Universal Speculative Decoding (USD): speculative decoding with different tokenizers - for the assistant and main models. This class generates candidates through the use of a smaller - model. - - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. 
[What are input IDs?](../glossary#input-ids) - assistant_model (`PreTrainedModel`): - The model to be used for generating candidates. This model should be smaller than the main model. - target_tokenizer (`PreTrainedTokenizerBase`): - The tokenizer used for the target model. - assistant_tokenizer (`PreTrainedTokenizerBase`): - The tokenizer used for the assistant model. - generation_config (`~generation.GenerationConfig`, *optional*): - The generation configuration to be used as base parametrization for the generation call. - logits_processor (`LogitsProcessorList`): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - model_kwargs (`Dict`): - The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant - model as well. - inputs_tensor (`torch.Tensor`, *optional*): - The model input tensor. In encoder-decoder models, this is the encoder input. + for the assistant and main models. This class generates candidates through the use of a smaller model. """ def __init__( @@ -947,6 +925,7 @@ def __init__( inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): + # Initialize translator before parent class self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) super().__init__( input_ids, @@ -958,99 +937,75 @@ def __init__( inputs_tensor, logits_processor, ) + # Track sequence lengths and previous assistant IDs self._prev_target_seq_len: int = 0 - self._prev_assistant_ids: torch.LongTensor | None = None + self._prev_assistant_ids: Optional[torch.LongTensor] = None def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ - Fetches the candidates to be tried for the current input. - - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) - - Return: - `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be - assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, - vocabulary_size)` containing the logits associated to each candidate. + Simplified version of get_candidates that uses the translator cache for token conversion. 
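        A minimal usage sketch (the variable names below are placeholders; candidate generators
        like this one are driven by `generate` rather than called directly, as in the
        different-tokenizer test that exercises this path):

            outputs = target_model.generate(
                input_ids,
                assistant_model=assistant_model,
                tokenizer=target_tokenizer,
                assistant_tokenizer=assistant_tokenizer,
            )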
""" - has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None - - def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: - nonlocal has_past_key_values - target_seq_len = target_input_ids.shape[-1] - target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] - self._prev_target_seq_len = target_seq_len - # Convert target_new_ids to string - target_new_toks = self.target_tokenizer.batch_decode( - target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True - ) - # Convert the string to assistant_new_ids - assistant_new_ids = self.assistant_tokenizer( - target_new_toks, add_special_tokens=False, return_tensors="pt" - )["input_ids"] - if self._prev_assistant_ids is None: - self._prev_assistant_ids = assistant_new_ids - else: - self._prev_assistant_ids = torch.cat(self._prev_assistant_ids, assistant_new_ids, dim=-1) - return self._prev_assistant_ids.to(self.assistant_model.device) - + input_ids = input_ids.to(self.assistant_model.device) target_input_ids = input_ids.clone() - input_ids = get_assistant_input_ids(input_ids) + assistant_input_ids = self._prepare_assistant_input_ids(target_input_ids) - # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. - new_cur_len = input_ids.shape[-1] - max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) - # TODO: Debug - # min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) + # Standard generation steps + min_new_tokens, max_new_tokens = self._calculate_new_tokens(assistant_input_ids) if max_new_tokens == 0: return input_ids, None - # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length - # (which implicitly contains the number of accepted candidates from the previous round) - # has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None - if has_past_key_values: - new_cache_size = new_cur_len - 1 - self.assistant_kwargs["past_key_values"] = _crop_past_key_values( - self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 - ) # the assistant does not have the token after the last match, hence the -1 - self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) - - # we need to update the attention mask to reflect the new input_ids length - self.assistant_kwargs = _prepare_attention_mask( - self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder - ) - - # 2. Forecast next N tokens using the assistant model. - assistant_generation_kwargs = { - self.input_ids_key: input_ids, - # "min_new_tokens": min_new_tokens, - # "max_new_tokens": max_new_tokens, - "min_new_tokens": 100, - "max_new_tokens": 100, - "generation_config": self.generation_config, - "logits_processor": self.logits_processor, - } + self._update_past_and_masks(assistant_input_ids) + generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens) - assistant_output = self.assistant_model.generate( - **assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True - ) + # Ensure scores are returned + generation_args["generation_config"].output_scores = True + generation_args["generation_config"].return_dict_in_generate = True - # 3. 
Update variables for the next round of candidate generation + # Generate and process outputs using translator + assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs) self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - # 4. Prepare variables for output - candidate_logits = torch.stack(assistant_output.logits, dim=1) - if not candidate_logits.shape[1] > 1: - msg = f"Since we set min_new_tokens to {assistant_generation_kwargs['min_new_tokens']} and max_new_tokens to {assistant_generation_kwargs['max_new_tokens']}, we expect at least 2 candidates, but seems like we got {candidate_logits.shape[1]} candidates." - raise Exception(msg) + candidate_logits = torch.stack(assistant_output.scores, dim=1) + if candidate_logits.shape[1] <= 1: + raise ValueError( + f"Expected at least 2 candidate tokens, but got {candidate_logits.shape[1]}. " + f"min_new_tokens: {generation_args['min_new_tokens']}, max_new_tokens: {generation_args['max_new_tokens']}." + ) + + # Use translator to convert tokens and logits candidate_ids = assistant_output.sequences candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) - target_ids = self._atm_translator.get_target_ids(input_ids, target_input_ids, candidate_ids) - + target_ids = self._atm_translator.get_target_ids(assistant_input_ids, target_input_ids, candidate_ids) target_logits = self._atm_translator.get_target_logits(candidate_logits) + return target_ids, target_logits + def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> torch.LongTensor: + """ + Simplified token conversion that only processes new tokens. + """ + # Calculate new tokens since last call + target_seq_len = target_input_ids.shape[-1] + new_token_count = target_seq_len - self._prev_target_seq_len + target_new_ids = target_input_ids[:, -new_token_count:] + self._prev_target_seq_len = target_seq_len + + # Convert only the new tokens + target_new_text = self.target_tokenizer.batch_decode( + target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True + ) + assistant_new_ids = self.assistant_tokenizer( + target_new_text, add_special_tokens=False, return_tensors="pt" + )["input_ids"].to(self.assistant_model.device) + + # Update or initialize assistant IDs + if self._prev_assistant_ids is None: + self._prev_assistant_ids = assistant_new_ids + else: + self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) + + return self._prev_assistant_ids + class PromptLookupCandidateGenerator(CandidateGenerator): """ From 8aa6020a5fb01acb6536f8c2a66f82744e2e8930 Mon Sep 17 00:00:00 2001 From: jmamou Date: Tue, 26 Nov 2024 03:33:56 -0800 Subject: [PATCH 45/76] fix negative value for max_new_tokens --- src/transformers/generation/candidate_generator.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index b41719b6d8ec58..b2b4a4a28508d9 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -951,7 +951,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, # Standard generation steps min_new_tokens, max_new_tokens = self._calculate_new_tokens(assistant_input_ids) - if max_new_tokens == 0: + if max_new_tokens <= 0: return input_ids, None self._update_past_and_masks(assistant_input_ids) @@ -966,11 +966,6 @@ def 
get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values candidate_logits = torch.stack(assistant_output.scores, dim=1) - if candidate_logits.shape[1] <= 1: - raise ValueError( - f"Expected at least 2 candidate tokens, but got {candidate_logits.shape[1]}. " - f"min_new_tokens: {generation_args['min_new_tokens']}, max_new_tokens: {generation_args['max_new_tokens']}." - ) # Use translator to convert tokens and logits candidate_ids = assistant_output.sequences From 2169973d3d6981aa3f9cb3a9a0480366303672a2 Mon Sep 17 00:00:00 2001 From: jmamou Date: Tue, 26 Nov 2024 04:49:08 -0800 Subject: [PATCH 46/76] fix generation length target + attention_mask vs. assistant + attent --- src/transformers/generation/candidate_generator.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index b2b4a4a28508d9..f51c12c223b123 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -940,6 +940,8 @@ def __init__( # Track sequence lengths and previous assistant IDs self._prev_target_seq_len: int = 0 self._prev_assistant_ids: Optional[torch.LongTensor] = None + # generation max length according to the assistant vocabulary + self.assistant_generation_max_length = -1 def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ @@ -948,14 +950,21 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, input_ids = input_ids.to(self.assistant_model.device) target_input_ids = input_ids.clone() assistant_input_ids = self._prepare_assistant_input_ids(target_input_ids) + if self.assistant_generation_max_length == -1: + self.assistant_generation_max_length = self.generation_config.max_length - input_ids.shape[1] + assistant_input_ids.shape[1] # Standard generation steps + target_generation_max_length = self.generation_config.max_length + self.generation_config.max_length = self.assistant_generation_max_length min_new_tokens, max_new_tokens = self._calculate_new_tokens(assistant_input_ids) - if max_new_tokens <= 0: + self.generation_config.max_length = target_generation_max_length + + if max_new_tokens == 0: return input_ids, None self._update_past_and_masks(assistant_input_ids) generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens) + self.assistant_kwargs.pop("attention_mask", None) # Ensure scores are returned generation_args["generation_config"].output_scores = True From c6da82736f75529458624c9c5a1e6e054238a8cc Mon Sep 17 00:00:00 2001 From: jmamou Date: Tue, 26 Nov 2024 04:49:27 -0800 Subject: [PATCH 47/76] fix device --- src/transformers/generation/logits_process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 39a38f9139ec1b..5fcd35c921af86 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1866,7 +1866,7 @@ def __init__(self, suppress_tokens, device: str = "cpu"): @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: vocab_tensor = torch.arange(scores.shape[-1], device=scores.device) - suppress_token_mask = 
isin_mps_friendly(vocab_tensor, self.suppress_tokens) + suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens.to(scores.device)) scores = torch.where(suppress_token_mask, -float("inf"), scores) return scores From 2cf9e8e5677ad2c18e83f694d8bbf0d6088f7ebd Mon Sep 17 00:00:00 2001 From: jmamou Date: Wed, 27 Nov 2024 04:28:24 -0800 Subject: [PATCH 48/76] fix negative max_new_tokens bug --- src/transformers/generation/candidate_generator.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index f51c12c223b123..61bb3b7f3240fc 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -940,8 +940,6 @@ def __init__( # Track sequence lengths and previous assistant IDs self._prev_target_seq_len: int = 0 self._prev_assistant_ids: Optional[torch.LongTensor] = None - # generation max length according to the assistant vocabulary - self.assistant_generation_max_length = -1 def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ @@ -950,14 +948,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, input_ids = input_ids.to(self.assistant_model.device) target_input_ids = input_ids.clone() assistant_input_ids = self._prepare_assistant_input_ids(target_input_ids) - if self.assistant_generation_max_length == -1: - self.assistant_generation_max_length = self.generation_config.max_length - input_ids.shape[1] + assistant_input_ids.shape[1] - - # Standard generation steps - target_generation_max_length = self.generation_config.max_length - self.generation_config.max_length = self.assistant_generation_max_length - min_new_tokens, max_new_tokens = self._calculate_new_tokens(assistant_input_ids) - self.generation_config.max_length = target_generation_max_length + min_new_tokens, max_new_tokens = self._calculate_new_tokens(target_input_ids) if max_new_tokens == 0: return input_ids, None From a1c0d051c3d0336429aed8a2495143275fa7b871 Mon Sep 17 00:00:00 2001 From: jmamou Date: Thu, 28 Nov 2024 06:13:57 -0800 Subject: [PATCH 49/76] fix UAG --- src/transformers/generation/candidate_generator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 61bb3b7f3240fc..2bed5d7533dbdf 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -455,15 +455,17 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, vocabulary_size)` containing the logits associated to each candidate. 
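        For illustration (`candidate_generator` is a hypothetical instance name): when
        `self.num_assistant_tokens` is 0, no drafting is attempted and the call reduces to

            candidate_ids, candidate_logits = candidate_generator.get_candidates(input_ids)
            # candidate_ids is the unmodified input_ids and candidate_logits is None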
""" + max_new_tokens = int(self.num_assistant_tokens) + if max_new_tokens == 0: # TODO + return input_ids, None + input_ids = input_ids.to(self.assistant_model.device) remove_from_pkv = 0 assistant_input_ids, remove_from_pkv = self._prepare_assistant_input_ids(input_ids) self.prev_assistant_ids = assistant_input_ids - min_new_tokens, max_new_tokens = self._calculate_new_tokens(assistant_input_ids) - if max_new_tokens == 0: - return input_ids, None + min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - assistant_input_ids.shape[-1]), 0) input_ids = input_ids.to(self.assistant_model.device) remove_from_pkv = 0 From d8300913641218c628f0a1c3ba61b2ed09673a01 Mon Sep 17 00:00:00 2001 From: jmamou Date: Thu, 28 Nov 2024 06:17:03 -0800 Subject: [PATCH 50/76] minor --- src/transformers/generation/candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 2bed5d7533dbdf..c044b7913a1ec5 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -456,7 +456,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, vocabulary_size)` containing the logits associated to each candidate. """ max_new_tokens = int(self.num_assistant_tokens) - if max_new_tokens == 0: # TODO + if max_new_tokens == 0: return input_ids, None input_ids = input_ids.to(self.assistant_model.device) From 19d0cceda7777a6074962a26645bc17b8245fee3 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Thu, 28 Nov 2024 16:10:28 -0500 Subject: [PATCH 51/76] formatting --- src/transformers/generation/candidate_generator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index c044b7913a1ec5..3435310b255757 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -458,7 +458,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, max_new_tokens = int(self.num_assistant_tokens) if max_new_tokens == 0: return input_ids, None - + input_ids = input_ids.to(self.assistant_model.device) remove_from_pkv = 0 @@ -991,9 +991,9 @@ def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> to target_new_text = self.target_tokenizer.batch_decode( target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True ) - assistant_new_ids = self.assistant_tokenizer( - target_new_text, add_special_tokens=False, return_tensors="pt" - )["input_ids"].to(self.assistant_model.device) + assistant_new_ids = self.assistant_tokenizer(target_new_text, add_special_tokens=False, return_tensors="pt")[ + "input_ids" + ].to(self.assistant_model.device) # Update or initialize assistant IDs if self._prev_assistant_ids is None: From 5b8217d17bfeb107884e822f01553d58928458ad Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Thu, 28 Nov 2024 16:14:07 -0500 Subject: [PATCH 52/76] `AssistedCandidateGeneratorDifferentTokenizers` `lookbehind`s init --- src/transformers/generation/candidate_generator.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 3435310b255757..5f1131a725509f 100644 --- a/src/transformers/generation/candidate_generator.py +++ 
b/src/transformers/generation/candidate_generator.py @@ -17,8 +17,6 @@ import threading import weakref from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple -import weakref -from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple import numpy as np import torch @@ -329,7 +327,6 @@ def __init__( self.target_tokenizer = target_tokenizer self.assistant_tokenizer = assistant_tokenizer self.prev_target_ids = None - self.prev_tokens = None self.prev_assistant_ids = None self.target_lookbehind = assistant_model.generation_config.target_lookbehind self.assistant_lookbehind = assistant_model.generation_config.assistant_lookbehind @@ -485,7 +482,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, # Update state self.prev_target_ids = input_ids self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - self.prev_tokens = assistant_output.sequences + self.prev_assistant_ids = assistant_output.sequences if input_ids.shape[1] >= new_target_ids.shape[1]: return input_ids, None @@ -500,7 +497,7 @@ def _prepare_assistant_input_ids(self, input_ids: torch.LongTensor) -> Tuple[tor } remove_from_pkv = 0 - if self.prev_tokens is not None and self.prev_target_ids.shape[1] > self.target_lookbehind: + if self.prev_assistant_ids is not None and self.prev_target_ids.shape[1] > self.target_lookbehind: # input_ids contains all target prompt input ids and some new target input ids start_index_in_target_window = self.prev_target_ids.shape[1] - self.target_lookbehind From 9b0126a8efc4dd5b1be0564e7a8f32ae149bc472 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 30 Nov 2024 13:18:48 -0500 Subject: [PATCH 53/76] resolve conflict & formatting --- tests/generation/test_candidate_generator.py | 45 +++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 1443a99634c168..260f92109b7bb2 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -4,9 +4,52 @@ import weakref from unittest.mock import MagicMock +import numpy as np import torch -from src.transformers.generation.candidate_generator import AssistantToTargetTranslator, AssistantVocabTranslatorCache +from transformers.generation.candidate_generator import ( + AssistantToTargetTranslator, + AssistantVocabTranslatorCache, + AssistedCandidateGeneratorDifferentTokenizers, +) + + +class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): + def test_no_intersection(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[4, 5, 6]]) + result = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag(prompt, prompt_plus_new_tokens) + self.assertEqual(result, (None, None, None)) + + def test_complete_overlap(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[1, 2, 3, 4, 5]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) + + def test_partial_overlap(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[2, 3, 4, 5]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + 
self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[4, 5]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) + + def test_no_new_tokens(self): + prompt = np.array([[1, 2, 3]]) + prompt_plus_new_tokens = np.array([[1, 2, 3]]) + discrep_length, new_tokens_only, discrep_only = AssistedCandidateGeneratorDifferentTokenizers._get_tokens_diag( + prompt, prompt_plus_new_tokens + ) + self.assertEqual(discrep_length, 0) + np.testing.assert_array_equal(new_tokens_only, np.array([[]])) + np.testing.assert_array_equal(discrep_only, np.array([[]])) class TestAssistantToTargetTranslator(unittest.TestCase): From 578d0b348aea95cbeb95b5f89d728f6e1eda1535 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 30 Nov 2024 13:57:28 -0500 Subject: [PATCH 54/76] rerun CI tests --- tests/generation/test_candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 260f92109b7bb2..798e53de53ca75 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -12,7 +12,7 @@ AssistantVocabTranslatorCache, AssistedCandidateGeneratorDifferentTokenizers, ) - + class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): def test_no_intersection(self): From 7db269547a01085845cf2cec3d62a3a7e26b9675 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 30 Nov 2024 13:57:46 -0500 Subject: [PATCH 55/76] remove space... --- tests/generation/test_candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 798e53de53ca75..260f92109b7bb2 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -12,7 +12,7 @@ AssistantVocabTranslatorCache, AssistedCandidateGeneratorDifferentTokenizers, ) - + class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): def test_no_intersection(self): From fb699001e14ed188e3e9661265dc2320ad60116a Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 2 Dec 2024 20:30:36 -0500 Subject: [PATCH 56/76] remove old code --- .../generation/candidate_generator.py | 229 ------------------ 1 file changed, 229 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 5f1131a725509f..25148b3a6b837c 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -464,14 +464,6 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - assistant_input_ids.shape[-1]), 0) - input_ids = input_ids.to(self.assistant_model.device) - remove_from_pkv = 0 - - assistant_input_ids, remove_from_pkv = self._prepare_assistant_input_ids(input_ids) - self.prev_assistant_ids = assistant_input_ids - - min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - assistant_input_ids.shape[-1]), 0) - self._update_past_and_masks(assistant_input_ids, remove_from_pkv) generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens) self.assistant_kwargs.pop("attention_mask", None) @@ -568,227 +560,6 @@ def _process_assistant_outputs( return new_target_ids -class AssistantToTargetTranslator: - """ - Translate the assistant into 
the target universe. - """ - - def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase"): - self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer - self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer - self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids() - self.suppress_input_ids: list[int] = self._get_suppress_input_ids() - self.logits_processors: LogitsProcessorList = LogitsProcessorList( - [ - SuppressTokensLogitsProcessor(self.suppress_input_ids), - LogitNormalization(), - ] - ) - - def _get_assistant_to_target_input_ids(self) -> dict[int, int]: - """ - Get a mapping from assistant tokens to target tokens based on vocabularies. - """ - target_vocab = self._target_tokenizer.get_vocab() - assistant_vocab = self._assistant_tokenizer.get_vocab() - return { - assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) - } - - def _get_suppress_input_ids(self) -> list[int]: - """ - Get the input ids that are in the assistant vocab but not in the target vocab. - """ - assistant_vocab = self._assistant_tokenizer.get_vocab() - return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) - - def get_target_ids(self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor) -> torch.LongTensor: - """ - Return the target candidate ids that correspond to the assistant candidate ids. - Note that we have already the target ids for the prompt and we only need to find the target ids for the new tokens. - Moreover, assistant ids of the original prompt does not necessarily appear in _assistant_to_target_input_ids. - """ - target_candidate_ids = assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :].apply_( - lambda x: self._assistant_to_target_input_ids.get(x, x) - ) - return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1) - - def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: - """ - Return the target logits that correspond to the assistant logits. - """ - target_vocab_size: int = len(self._target_tokenizer.get_vocab()) - target_shape: tuple[int, ...] 
= (*assistant_logits.shape[:-1], target_vocab_size) - target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")) - assistant_logits_supported_mask: torch.BoolTensor = assistant_logits > -float("inf") - assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[ - -1 - ] - target_logits_supported_indices: torch.IntTensor = assistant_logits_supported_indices.apply_( - lambda x: self._assistant_to_target_input_ids[x] - ) - target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask] - return target_logits - - -class AssistantVocabTranslatorCache: - _lock = threading.Lock() - _cache = weakref.WeakKeyDictionary() - - @classmethod - def get_translator( - cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase" - ) -> AssistantToTargetTranslator: - with cls._lock: - assistant_dict = cls._cache.get(target_tokenizer) - if assistant_dict is None: - assistant_dict = weakref.WeakKeyDictionary() - cls._cache[target_tokenizer] = assistant_dict - - mapping = assistant_dict.get(assistant_tokenizer) - if mapping is None: - mapping = AssistantToTargetTranslator(target_tokenizer, assistant_tokenizer) - assistant_dict[assistant_tokenizer] = mapping - - return mapping - - -class UniversalSpeculativeDecodingGenerator(AssistedCandidateGeneratorDifferentTokenizers): - """ - `CandidateGenerator` class to be used for Universal Speculative Decoding (USD): speculative decoding with different tokenizers - for the assistant and main models. This class generates candidates through the use of a smaller - model. - - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) - assistant_model (`PreTrainedModel`): - The model to be used for generating candidates. This model should be smaller than the main model. - target_tokenizer (`PreTrainedTokenizerBase`): - The tokenizer used for the target model. - assistant_tokenizer (`PreTrainedTokenizerBase`): - The tokenizer used for the assistant model. - generation_config (`~generation.GenerationConfig`, *optional*): - The generation configuration to be used as base parametrization for the generation call. - logits_processor (`LogitsProcessorList`): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - model_kwargs (`Dict`): - The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant - model as well. - inputs_tensor (`torch.Tensor`, *optional*): - The model input tensor. In encoder-decoder models, this is the encoder input. 
- """ - - def __init__( - self, - input_ids: torch.LongTensor, - assistant_model: "PreTrainedModel", - target_tokenizer: "PreTrainedTokenizerBase", - assistant_tokenizer: "PreTrainedTokenizerBase", - generation_config: "GenerationConfig", - model_kwargs: Dict, - inputs_tensor: Optional[torch.Tensor] = None, - logits_processor: "LogitsProcessorList" = None, - ): - self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) - super().__init__( - input_ids, - assistant_model, - target_tokenizer, - assistant_tokenizer, - generation_config, - model_kwargs, - inputs_tensor, - logits_processor, - ) - self._prev_target_seq_len: int = 0 - self._prev_assistant_ids: torch.LongTensor | None = None - - def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: - """ - Fetches the candidates to be tried for the current input. - - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) - - Return: - `torch.LongTensor` of shape `(batch_size, candidate_length)` containing the candidate sequences to be - assessed by the model and a `torch.FloatTensor` of shape `(batch_size, candidate_length, - vocabulary_size)` containing the logits associated to each candidate. - """ - has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None - - def get_assistant_input_ids(target_input_ids: torch.LongTensor) -> torch.LongTensor: - nonlocal has_past_key_values - target_seq_len = target_input_ids.shape[-1] - target_new_ids = target_input_ids[:, -(target_seq_len - self._prev_target_seq_len) :] - self._prev_target_seq_len = target_seq_len - # Convert target_new_ids to string - target_new_toks = self.target_tokenizer.batch_decode(target_new_ids, skip_special_tokens=False) - # Convert the string to assistant_new_ids - assistant_new_ids = self.assistant_tokenizer.encode(target_new_toks[0], add_special_tokens=False) - if self._prev_assistant_ids is None: - self._prev_assistant_ids = assistant_new_ids - else: - self._prev_assistant_ids = self._prev_assistant_ids + assistant_new_ids - return torch.tensor(self._prev_assistant_ids).unsqueeze(0).to(self.assistant_model.device) - - target_input_ids = input_ids.clone() - input_ids = get_assistant_input_ids(input_ids) - - # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. - new_cur_len = input_ids.shape[-1] - max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1) - min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) - if max_new_tokens == 0: - return input_ids, None - - # 1. 
If it is not the first round of candidate generation, prepare the inputs based on the input_ids length - # (which implicitly contains the number of accepted candidates from the previous round) - # has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None - if has_past_key_values: - new_cache_size = new_cur_len - 1 - self.assistant_kwargs["past_key_values"] = _crop_past_key_values( - self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 - ) # the assistant does not have the token after the last match, hence the -1 - self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len) - - # we need to update the attention mask to reflect the new input_ids length - self.assistant_kwargs = _prepare_attention_mask( - self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder - ) - - # 2. Forecast next N tokens using the assistant model. - assistant_generation_kwargs = { - self.input_ids_key: input_ids, - # "min_new_tokens": min_new_tokens, - # "max_new_tokens": max_new_tokens, - "min_new_tokens": 100, - "max_new_tokens": 100, - "generation_config": self.generation_config, - "logits_processor": self.logits_processor, - } - - assistant_output = self.assistant_model.generate(**assistant_generation_kwargs, **self.assistant_kwargs, output_logits=True) - - # 3. Update variables for the next round of candidate generation - self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - - # 4. Prepare variables for output - candidate_logits = torch.stack(assistant_output.logits, dim=1) - if not candidate_logits.shape[1] > 1: - msg = f"Since we set min_new_tokens to {assistant_generation_kwargs['min_new_tokens']} and max_new_tokens to {assistant_generation_kwargs['max_new_tokens']}, we expect at least 2 candidates, but seems like we got {candidate_logits.shape[1]} candidates." - raise Exception(msg) - candidate_ids = assistant_output.sequences - candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) - target_ids = self._atm_translator.get_target_ids(input_ids, target_input_ids, candidate_ids) - - target_logits = self._atm_translator.get_target_logits(candidate_logits) - return target_ids, target_logits - - class AssistantToTargetTranslator: """ Translate the assistant into the target universe. From e40c775c8b5671e17357872221b71c751e98f995 Mon Sep 17 00:00:00 2001 From: jmamou Date: Wed, 4 Dec 2024 05:04:38 -0800 Subject: [PATCH 57/76] fix candidate_input_ids device --- src/transformers/generation/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 07c743c6b79f3d..77535d18464ee2 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -4280,7 +4280,8 @@ def _assisted_decoding( # 1. Fetch candidate sequences from a `CandidateGenerator` candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids) - + candidate_input_ids = candidate_input_ids.to(self.device) + if candidate_logits is not None: candidate_logits = candidate_logits.to(self.device) @@ -4482,9 +4483,12 @@ def _speculative_sampling( # Gets the probabilities from the logits. q_i and p_i denote the assistant and model probabilities of the tokens # selected by the assistant, respectively. 
q = candidate_logits.softmax(dim=-1) + device = new_candidate_input_ids.device + #new_candidate_input_ids = new_candidate_input_ids.to(q.device) q_i = q[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1) p = new_logits.softmax(dim=-1) p_i = p[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1) + #new_candidate_input_ids = new_candidate_input_ids.to(device) probability_ratio = p_i / q_i # When probability_ratio > 1 (i.e. q_i(x) < p_i(x), or "assistant probability of the candidate token is smaller From b5ce873fb930c83cdaeb85fa7fbded0dfb06c1bd Mon Sep 17 00:00:00 2001 From: jmamou Date: Wed, 4 Dec 2024 05:10:06 -0800 Subject: [PATCH 58/76] minor --- src/transformers/generation/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 77535d18464ee2..90c23fe7f08dff 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -4483,12 +4483,9 @@ def _speculative_sampling( # Gets the probabilities from the logits. q_i and p_i denote the assistant and model probabilities of the tokens # selected by the assistant, respectively. q = candidate_logits.softmax(dim=-1) - device = new_candidate_input_ids.device - #new_candidate_input_ids = new_candidate_input_ids.to(q.device) q_i = q[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1) p = new_logits.softmax(dim=-1) p_i = p[:, torch.arange(candidate_length), new_candidate_input_ids].squeeze(0, 1) - #new_candidate_input_ids = new_candidate_input_ids.to(device) probability_ratio = p_i / q_i # When probability_ratio > 1 (i.e. q_i(x) < p_i(x), or "assistant probability of the candidate token is smaller From d34d7eaa420b42ffc501e9fc2eb4c31c52d023f1 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Wed, 4 Dec 2024 20:33:32 -0500 Subject: [PATCH 59/76] formatting --- src/transformers/generation/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 90c23fe7f08dff..7a9d78168ac903 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -4281,7 +4281,7 @@ def _assisted_decoding( # 1. 
Fetch candidate sequences from a `CandidateGenerator` candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids) candidate_input_ids = candidate_input_ids.to(self.device) - + if candidate_logits is not None: candidate_logits = candidate_logits.to(self.device) From 9d4d9f9a8cd53b2cdf0fae22846bc07012b29507 Mon Sep 17 00:00:00 2001 From: Jonathan Mamou Date: Wed, 18 Dec 2024 00:52:10 +0200 Subject: [PATCH 60/76] Fix prepare + apply (#7) * fix prepare + apply * move to cpu * simplity suppress_tokens * fix bugs and refacatoring * device move * handle self.config.vocab_size > len(target_tokenizer.get_vocab()) * no need to normalize in candidate_generator * address Nadav's comments + minor * optimize device move + SuppressTokensLogitsProcessor * AssistantToTargetTranslator, SuppressTokensLogitsProcessor and tokenizers mapping improvements * padding size * padding improvement * fix and simplify get_target_logits * renaming in get_target_logits * minor * add filter_value and suppress_tokens_id * style + rename * remove TODO * restore original SelectTokensLogitsProcessor with modification * fix style * fix _update_past_and_masks and optimize code * remove assistant_vocab_size arg * fix attention_mask * call _prepare_attention_mask also if not has_past_key_values * handling attention mask for first generation * comment * restore test * remove SelectTokensLogitsProcessor * _update_past_and_masks implementation for USD --- .../generation/candidate_generator.py | 158 +++++++++++------- src/transformers/generation/logits_process.py | 7 +- src/transformers/generation/utils.py | 2 + tests/generation/test_candidate_generator.py | 2 +- 4 files changed, 100 insertions(+), 69 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 25148b3a6b837c..a37481018086d0 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -24,7 +24,6 @@ from ..cache_utils import DynamicCache from ..pytorch_utils import isin_mps_friendly from .logits_process import ( - LogitNormalization, LogitsProcessorList, MinLengthLogitsProcessor, SuppressTokensLogitsProcessor, @@ -245,18 +244,21 @@ def _calculate_new_tokens(self, input_ids: torch.LongTensor) -> Tuple[int, int]: min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0) return min_new_tokens, max_new_tokens - def _update_past_and_masks(self, input_ids: torch.LongTensor, remove_from_pkv: int = 0) -> bool: + def _update_past_and_masks( + self, input_ids: torch.LongTensor, remove_from_pkv: int = 0, num_added_tokens: int = 1 + ) -> bool: """Update past key values and attention masks for subsequent generation rounds.""" has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None if has_past_key_values: new_cache_size = input_ids.shape[-1] - 1 - remove_from_pkv self.assistant_kwargs["past_key_values"] = _crop_past_key_values( - self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1 + self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - num_added_tokens ) self.assistant_kwargs = _prepare_attention_mask( self.assistant_kwargs, input_ids.shape[-1], self.assistant_model.config.is_encoder_decoder ) self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, input_ids.shape[-1]) + return has_past_key_values def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, 
max_new_tokens: int) -> Dict: @@ -565,34 +567,41 @@ class AssistantToTargetTranslator: Translate the assistant into the target universe. """ - def __init__(self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase"): + def __init__( + self, + target_tokenizer: "PreTrainedTokenizerBase", + assistant_tokenizer: "PreTrainedTokenizerBase", + assistant_model_device, + target_vocab_size: int, + filter_value: float = -float("Inf"), + suppress_tokens_id: int = -1, + ): self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer - self._assistant_to_target_input_ids: dict[int, int] = self._get_assistant_to_target_input_ids() - self.suppress_input_ids: list[int] = self._get_suppress_input_ids() + self._assistant_model_device = assistant_model_device + self.target_vocab_size: int = target_vocab_size + self.filter_value: float = filter_value + self.suppress_tokens_id: int = suppress_tokens_id + self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() self.logits_processors: LogitsProcessorList = LogitsProcessorList( - [ - SuppressTokensLogitsProcessor(self.suppress_input_ids), - LogitNormalization(), - ] + [SuppressTokensLogitsProcessor(self._get_suppress_input_ids(), self._assistant_model_device)] ) - def _get_assistant_to_target_input_ids(self) -> dict[int, int]: - """ - Get a mapping from assistant tokens to target tokens based on vocabularies. - """ + def _get_assistant_to_target_input_ids(self): target_vocab = self._target_tokenizer.get_vocab() assistant_vocab = self._assistant_tokenizer.get_vocab() - return { - assistant_vocab[tok]: target_vocab[tok] for tok in set(target_vocab.keys()) & set(assistant_vocab.keys()) - } + max_assistant_index = max(assistant_vocab.values()) + assistant_to_target_input_ids = torch.full((max_assistant_index + 1,), self.suppress_tokens_id, dtype=int) + for tok, idx in assistant_vocab.items(): + if tok in target_vocab: + assistant_to_target_input_ids[idx] = target_vocab[tok] + return assistant_to_target_input_ids.to(self._assistant_model_device) def _get_suppress_input_ids(self) -> list[int]: """ Get the input ids that are in the assistant vocab but not in the target vocab. """ - assistant_vocab = self._assistant_tokenizer.get_vocab() - return list(set(assistant_vocab.values()) - set(self._assistant_to_target_input_ids.keys())) + return torch.where(self._assistant_to_target_input_ids == self.suppress_tokens_id)[0] def get_target_ids( self, assistant_input_ids, target_input_ids, assistant_candidate_ids: torch.LongTensor @@ -602,33 +611,29 @@ def get_target_ids( Note that we have already the target ids for the prompt and we only need to find the target ids for the new tokens. Moreover, assistant ids of the original prompt does not necessarily appear in _assistant_to_target_input_ids. 
""" - device = assistant_candidate_ids.device - target_candidate_ids = ( - assistant_candidate_ids[0, -(len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1]) :] - .cpu() - .apply_(lambda x: self._assistant_to_target_input_ids.get(x, x)) - .to(device) - ) - return torch.cat((target_input_ids, target_candidate_ids.unsqueeze(0)), dim=1) + + num_new_tokens = len(assistant_candidate_ids[0]) - assistant_input_ids.shape[1] + if num_new_tokens == 0: + return target_input_ids + else: + transformed_slice = self._assistant_to_target_input_ids[assistant_candidate_ids[0, -num_new_tokens:]] + return torch.cat((target_input_ids, transformed_slice.unsqueeze(0)), dim=1) def get_target_logits(self, assistant_logits: torch.FloatTensor) -> torch.FloatTensor: """ Return the target logits that correspond to the assistant logits. """ - device = assistant_logits.device - target_vocab_size: int = len(self._target_tokenizer.get_vocab()) - target_shape: tuple[int, ...] = (*assistant_logits.shape[:-1], target_vocab_size) - target_logits: torch.FloatTensor = torch.full(target_shape, -float("inf")).to(device) - assistant_logits_supported_mask: torch.BoolTensor = assistant_logits > -float("inf") - assistant_logits_supported_indices: torch.IntTensor = assistant_logits_supported_mask.nonzero(as_tuple=True)[ - -1 - ] - target_logits_supported_indices: torch.IntTensor = ( - assistant_logits_supported_indices.cpu() - .apply_(lambda x: self._assistant_to_target_input_ids[x]) - .to(device) - ) - target_logits[..., target_logits_supported_indices] = assistant_logits[..., assistant_logits_supported_mask] + + target_shape: tuple[int, ...] = (*assistant_logits.shape[:-1], self.target_vocab_size) + target_logits: torch.FloatTensor = torch.full(target_shape, self.filter_value).to(self._assistant_model_device) + # Mask for valid indices + assistant_indices_mask = self._assistant_to_target_input_ids != self.suppress_tokens_id + # Exclude invalid indices + target_logits_supported_indices = self._assistant_to_target_input_ids[assistant_indices_mask] + valid_assistant_logits = assistant_logits[..., : self._assistant_to_target_input_ids.shape[0]] + + target_logits[..., target_logits_supported_indices] = valid_assistant_logits[..., assistant_indices_mask] + return target_logits @@ -643,7 +648,11 @@ class AssistantVocabTranslatorCache: @classmethod def get_translator( - cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase" + cls, + target_tokenizer: "PreTrainedTokenizerBase", + assistant_tokenizer: "PreTrainedTokenizerBase", + assistant_model_device, + target_vocab_size: int, ) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer) @@ -653,7 +662,9 @@ def get_translator( mapping = assistant_dict.get(assistant_tokenizer) if mapping is None: - mapping = AssistantToTargetTranslator(target_tokenizer, assistant_tokenizer) + mapping = AssistantToTargetTranslator( + target_tokenizer, assistant_tokenizer, assistant_model_device, target_vocab_size + ) assistant_dict[assistant_tokenizer] = mapping return mapping @@ -692,11 +703,14 @@ def __init__( assistant_tokenizer: "PreTrainedTokenizerBase", generation_config: "GenerationConfig", model_kwargs: Dict, + target_vocab_size: int, inputs_tensor: Optional[torch.Tensor] = None, logits_processor: "LogitsProcessorList" = None, ): # Initialize translator before parent class - self._atm_translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) + self._atm_translator = 
AssistantVocabTranslatorCache.get_translator( + target_tokenizer, assistant_tokenizer, assistant_model.device, target_vocab_size + ) super().__init__( input_ids, assistant_model, @@ -708,42 +722,49 @@ def __init__( logits_processor, ) # Track sequence lengths and previous assistant IDs - self._prev_target_seq_len: int = 0 + self._target_seq_len_with_candidates: int = 0 self._prev_assistant_ids: Optional[torch.LongTensor] = None + self.target_vocab_size = target_vocab_size def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ Simplified version of get_candidates that uses the translator cache for token conversion. """ - input_ids = input_ids.to(self.assistant_model.device) - target_input_ids = input_ids.clone() - assistant_input_ids = self._prepare_assistant_input_ids(target_input_ids) + target_input_ids = input_ids.to(self.assistant_model.device) + assistant_input_ids, num_added_tokens = self._prepare_assistant_input_ids(target_input_ids) min_new_tokens, max_new_tokens = self._calculate_new_tokens(target_input_ids) if max_new_tokens == 0: return input_ids, None - self._update_past_and_masks(assistant_input_ids) + self._update_past_and_masks(assistant_input_ids, num_added_tokens=num_added_tokens) generation_args = self._prepare_generation_args(assistant_input_ids, min_new_tokens, max_new_tokens) - self.assistant_kwargs.pop("attention_mask", None) # Ensure scores are returned generation_args["generation_config"].output_scores = True generation_args["generation_config"].return_dict_in_generate = True # Generate and process outputs using translator - assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs) - self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - - candidate_logits = torch.stack(assistant_output.scores, dim=1) + generation_args["logits_processor"] = self._atm_translator.logits_processors + self._prev_assistant_ids, assistant_candidate_logits = self._generate_candidates(generation_args) # Use translator to convert tokens and logits - candidate_ids = assistant_output.sequences - candidate_logits = self._atm_translator.logits_processors(input_ids=candidate_ids, scores=candidate_logits) - target_ids = self._atm_translator.get_target_ids(assistant_input_ids, target_input_ids, candidate_ids) - target_logits = self._atm_translator.get_target_logits(candidate_logits) + target_candidate_ids = self._atm_translator.get_target_ids( + assistant_input_ids, target_input_ids, self._prev_assistant_ids + ) + self._target_seq_len_with_candidates = target_candidate_ids.shape[-1] + target_candidate_logits = self._atm_translator.get_target_logits(assistant_candidate_logits) - return target_ids, target_logits + return target_candidate_ids, target_candidate_logits + + def _update_past_and_masks(self, assistant_input_ids: torch.LongTensor, num_added_tokens: int = 1) -> bool: + if self._prev_assistant_ids is None: + # Prepare attention mask for the first generation. + # For subsequent generations, the attention mask is updated in super()_update_past_and_masks. 
+ self.assistant_kwargs = _prepare_attention_mask( + self.assistant_kwargs, assistant_input_ids.shape[-1], self.assistant_model.config.is_encoder_decoder + ) + return super()._update_past_and_masks(assistant_input_ids, num_added_tokens=num_added_tokens) def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> torch.LongTensor: """ @@ -751,9 +772,11 @@ def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> to """ # Calculate new tokens since last call target_seq_len = target_input_ids.shape[-1] - new_token_count = target_seq_len - self._prev_target_seq_len + if self._target_seq_len_with_candidates == 0: + new_token_count = target_seq_len + else: + new_token_count = 1 target_new_ids = target_input_ids[:, -new_token_count:] - self._prev_target_seq_len = target_seq_len # Convert only the new tokens target_new_text = self.target_tokenizer.batch_decode( @@ -765,11 +788,16 @@ def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> to # Update or initialize assistant IDs if self._prev_assistant_ids is None: - self._prev_assistant_ids = assistant_new_ids + assistant_input_ids = assistant_new_ids else: - self._prev_assistant_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) - - return self._prev_assistant_ids + tokens_to_remove = self._target_seq_len_with_candidates + 1 - target_seq_len + # If the number of new tokens is greater than zero, truncate the previous assistant IDs + if tokens_to_remove > 0: + self._prev_assistant_ids = self._prev_assistant_ids[:, :-tokens_to_remove] + assistant_input_ids = torch.cat([self._prev_assistant_ids, assistant_new_ids], dim=-1) + assistant_input_ids = assistant_input_ids.to(torch.int) + + return assistant_input_ids, len(assistant_new_ids[0]) class PromptLookupCandidateGenerator(CandidateGenerator): diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 5fcd35c921af86..e818b266cd7b7c 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1860,14 +1860,15 @@ class SuppressTokensLogitsProcessor(LogitsProcessor): ``` """ - def __init__(self, suppress_tokens, device: str = "cpu"): + def __init__(self, suppress_tokens, device: str = "cpu", filter_value: float = -float("Inf")): self.suppress_tokens = torch.tensor(list(suppress_tokens), device=device) + self.filter_value = filter_value @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: vocab_tensor = torch.arange(scores.shape[-1], device=scores.device) - suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens.to(scores.device)) - scores = torch.where(suppress_token_mask, -float("inf"), scores) + suppress_token_mask = isin_mps_friendly(vocab_tensor, self.suppress_tokens) + scores = torch.where(suppress_token_mask, self.filter_value, scores) return scores diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 7a9d78168ac903..d7d9757d3e4f39 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -858,6 +858,8 @@ def _get_candidate_generator( logits_processor=logits_processor, target_tokenizer=target_tokenizer, assistant_tokenizer=assistant_tokenizer, + # required in the case that self.config.vocab_size is different from the length of target_tokenizer.get_vocab() + target_vocab_size=self.config.vocab_size, ) case False: 
candidate_generator = AssistedCandidateGeneratorDifferentTokenizers( diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 260f92109b7bb2..dd7e427a3bfda9 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -79,7 +79,7 @@ def test_get_assistant_to_target_input_ids(self): def test_get_suppress_input_ids(self): """Test the suppression of assistant input IDs not present in the target vocabulary.""" expected_suppress_ids = [4] - actual_suppress_ids = self.translator.suppress_input_ids + actual_suppress_ids = self.translator._suppress_input_ids self.assertEqual(actual_suppress_ids, expected_suppress_ids) def test_get_target_ids(self): From 4e92e9ced98340f3b183453ba8e67a0438851e11 Mon Sep 17 00:00:00 2001 From: Gaurav Date: Thu, 12 Dec 2024 02:30:25 +0000 Subject: [PATCH 61/76] Add unittests for Universal Assisted generation --- tests/test_universal_assisted_generation.py | 119 ++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 tests/test_universal_assisted_generation.py diff --git a/tests/test_universal_assisted_generation.py b/tests/test_universal_assisted_generation.py new file mode 100644 index 00000000000000..8eae5a71de0749 --- /dev/null +++ b/tests/test_universal_assisted_generation.py @@ -0,0 +1,119 @@ +import unittest + +from zmq import device +import torch +import logging +from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig +from transformers.generation.candidate_generator import UniversalSpeculativeDecodingGenerator + +logging.basicConfig(level=logging.DEBUG, format='%(message)s') + +if torch.cuda.is_available(): + device = "cuda" + +class TestUniversalSpeculativeDecoding(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Setup main and assistant models + cls.main_model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-3.2-1B-Instruct").to(device) + cls.assistant_model = AutoModelForCausalLM.from_pretrained( + "Qwen/Qwen2.5-0.5B-Instruct").to(device) + cls.main_tokenizer = AutoTokenizer.from_pretrained( + "meta-llama/Llama-3.2-1B-Instruct") + cls.assistant_tokenizer = AutoTokenizer.from_pretrained( + "Qwen/Qwen2.5-0.5B-Instruct") + cls.generation_config = GenerationConfig() + + # Ensure required tokens exist + if cls.main_tokenizer.pad_token_id is None: + cls.main_tokenizer.pad_token_id = cls.main_tokenizer.eos_token_id + if cls.main_tokenizer.bos_token_id is None: + cls.main_tokenizer.bos_token_id = cls.main_tokenizer.eos_token_id + + def setUp(self): + self.input_ids = torch.tensor([[1, 2, 3]]).to(device) + self.model_kwargs = { + "attention_mask": torch.ones_like(self.input_ids).to(device), + } + self.generator = UniversalSpeculativeDecodingGenerator( + input_ids=self.input_ids, + assistant_model=self.assistant_model, + target_tokenizer=self.main_tokenizer, + assistant_tokenizer=self.assistant_tokenizer, + generation_config=self.generation_config, + model_kwargs=self.model_kwargs, + target_vocab_size=self.main_tokenizer.vocab_size, + ) + + def test_basic_generation(self): + """Test basic speculative decoding works""" + input_text = "The quick brown fox" + input_ids = self.main_tokenizer.encode(input_text, return_tensors="pt") + self.generator.input_ids = input_ids + candidates, scores = self.generator.get_candidates(input_ids) + + self.assertIsNotNone(candidates) + self.assertIsNotNone(scores) + self.assertTrue(torch.is_tensor(candidates)) + self.assertTrue(torch.is_tensor(scores)) + + def 
test_mismatched_vocabularies(self): + """Test handling of mismatched vocabularies between models""" + # Create input with tokens present in main but not assistant vocab + # Find a token that is not in the assistant tokenizer but in + # the main tokenizer. + missing_token = next( + token + for token in self.main_tokenizer.get_vocab() + if token not in self.assistant_tokenizer.get_vocab() + ) + + input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) + self.generator.input_ids = input_ids + candidates, scores = self.generator.get_candidates(input_ids) + self.assertIsNotNone(candidates) + + def test_empty_input(self): + if False: + """Test handling of empty input""" + input_ids = torch.tensor([[]], dtype=torch.long) + self.generator.input_ids = input_ids + with self.assertRaises(ValueError): + self.generator.get_candidates(input_ids) + + def test_long_sequence(self): + if False: + """Test handling of very long input sequences""" + long_input = torch.ones((1, 2048), dtype=torch.long) + self.generator.input_ids = long_input + candidates, scores = self.generator.get_candidates(long_input) + self.assertLessEqual( + candidates.shape[1], + self.main_model.config.max_position_embeddings, + ) + + def test_speculation_depth(self): + """Test different speculation depths""" + input_ids = self.main_tokenizer.encode("Test text", return_tensors="pt") + self.generator.input_ids = input_ids + + for depth in [1, 8, 17]: + self.generation_config.num_assistant_tokens = depth + candidates, scores = self.generator.get_candidates(input_ids) + self.assertLessEqual( + candidates.shape[1] - input_ids.shape[1], depth + ) + + def test_device_consistency(self): + """Test handling of inputs on different devices""" + if torch.cuda.is_available(): + input_ids = torch.tensor([[1, 2, 3]]).to( + self.generator.assistant_model.device) + self.generator.input_ids = input_ids + candidates, scores = self.generator.get_candidates(input_ids) + self.assertEqual(candidates.device, input_ids.device) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From a350b1cbdbdda53bf1679047f28efa656e16659c Mon Sep 17 00:00:00 2001 From: jmamou Date: Wed, 18 Dec 2024 02:56:46 -0800 Subject: [PATCH 62/76] fix style --- src/transformers/generation/candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 0f57c1f6d56eab..26c81da0c1a5a2 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -304,7 +304,7 @@ def _update_past_and_masks( self.assistant_kwargs, input_ids.shape[-1], self.assistant_model.config.is_encoder_decoder ) self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, input_ids.shape[-1]) - + return has_past_key_values def _prepare_generation_args(self, input_ids: torch.LongTensor, min_new_tokens: int, max_new_tokens: int) -> Dict: From e047adf6a414817d77843ce696f8ed11f9949ec4 Mon Sep 17 00:00:00 2001 From: jmamou Date: Wed, 18 Dec 2024 04:15:27 -0800 Subject: [PATCH 63/76] update tests --- tests/generation/test_candidate_generator.py | 72 +++++++++++++++----- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index dd7e427a3bfda9..9a2b2831e5d8e9 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ 
-64,22 +64,27 @@ def setUp(self): self.target_tokenizer.get_vocab.return_value = self.target_vocab self.assistant_tokenizer.get_vocab.return_value = self.assistant_vocab + self.assistant_model_device = "cpu" + self.target_vocab_size = 6 # Instantiate the class under test self.translator = AssistantToTargetTranslator( - target_tokenizer=self.target_tokenizer, assistant_tokenizer=self.assistant_tokenizer + target_tokenizer=self.target_tokenizer, + assistant_tokenizer=self.assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, ) def test_get_assistant_to_target_input_ids(self): """Test the mapping from assistant tokens to target tokens.""" - expected_mapping = {0: 0, 1: 1, 2: 2} - actual_mapping = self.translator._assistant_to_target_input_ids + expected_mapping = [0, 1, 2, self.translator.suppress_tokens_id, self.translator.suppress_tokens_id] + actual_mapping = self.translator._assistant_to_target_input_ids.tolist() self.assertEqual(actual_mapping, expected_mapping) def test_get_suppress_input_ids(self): """Test the suppression of assistant input IDs not present in the target vocabulary.""" - expected_suppress_ids = [4] - actual_suppress_ids = self.translator._suppress_input_ids + expected_suppress_ids = [3, 4] + actual_suppress_ids = self.translator._get_suppress_input_ids().tolist() self.assertEqual(actual_suppress_ids, expected_suppress_ids) def test_get_target_ids(self): @@ -89,8 +94,8 @@ def test_get_target_ids(self): assistant_candidate_ids = torch.LongTensor([[0, 1, 2, 4]]) # 'hello world foo baz' in assistant tokenizer expected_target_ids = torch.LongTensor( - [[0, 1, 2, 4]] - ) # 'hello world foo baz' in target tokenizer (baz id remains 4) + [[0, 1, 2, self.translator.suppress_tokens_id]] + ) # 'hello world foo baz' in target tokenizer (baz is mapped to self.translator.suppress_tokens_id since it does not exist in target vocab) actual_target_ids = self.translator.get_target_ids( assistant_input_ids, target_input_ids, assistant_candidate_ids @@ -100,10 +105,10 @@ def test_get_target_ids(self): def test_get_target_logits(self): """Test the conversion of assistant logits to target logits.""" # Assistant logits for IDs 0, 1, 2 - assistant_logits = torch.FloatTensor([[[0.1, 0.2, 0.3]]]) # Shape (1, 1, 3) + assistant_logits = torch.FloatTensor([[[0.1, 0.2, 0.3, 0.4, self.translator.filter_value]]]) # Shape (1, 1, 5) # Expected target logits (target_vocab_size = 4) - expected_target_logits = torch.full((1, 1, 4), -float("inf")) + expected_target_logits = torch.full((1, 1, self.target_vocab_size), self.translator.filter_value) expected_target_logits[0, 0, 0] = 0.1 # 'hello' expected_target_logits[0, 0, 1] = 0.2 # 'world' expected_target_logits[0, 0, 2] = 0.3 # 'foo' @@ -132,18 +137,38 @@ def setUp(self): self.assistant_tokenizer = MockTokenizer({"hello": 0, "world": 1, "foo": 2}) self.other_target_tokenizer = MockTokenizer({"foo": 2, "bar": 3}) self.other_assistant_tokenizer = MockTokenizer({"baz": 4, "qux": 5}) + self.assistant_model_device = "cpu" + self.target_vocab_size = 6 def test_same_instance_for_same_tokenizers(self): """Test that the same translator is returned for the same tokenizers.""" - translator1 = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) - translator2 = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + translator1 = AssistantVocabTranslatorCache.get_translator( + self.target_tokenizer, + self.assistant_tokenizer, 
+ assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, + ) + translator2 = AssistantVocabTranslatorCache.get_translator( + self.target_tokenizer, + self.assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, + ) self.assertIs(translator1, translator2, "Translators should be cached and identical") def test_different_instances_for_different_tokenizers(self): """Test that different tokenizers produce different translators.""" - translator1 = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + translator1 = AssistantVocabTranslatorCache.get_translator( + self.target_tokenizer, + self.assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, + ) translator2 = AssistantVocabTranslatorCache.get_translator( - self.other_target_tokenizer, self.other_assistant_tokenizer + self.other_target_tokenizer, + self.other_assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, ) self.assertIsNot(translator1, translator2, "Translators should differ for different tokenizers") @@ -154,7 +179,12 @@ def test_cache_with_weakref_key(self): assistant_tokenizer = MockTokenizer({"hello": 0}) # Store translator in a local variable to avoid it being kept alive - translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) + translator = AssistantVocabTranslatorCache.get_translator( + target_tokenizer, + assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, + ) self.assertEqual(len(AssistantVocabTranslatorCache._cache), initial_cache_size + 1) # Delete all strong references @@ -177,7 +207,12 @@ def test_weakref_cache_cleanup(self): def create_translator(): target_tokenizer = MockTokenizer({"hello": 0}) assistant_tokenizer = MockTokenizer({"hello": 0}) - translator = AssistantVocabTranslatorCache.get_translator(target_tokenizer, assistant_tokenizer) + translator = AssistantVocabTranslatorCache.get_translator( + target_tokenizer, + assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, + ) # Create weak references before returning refs = (weakref.ref(translator), weakref.ref(target_tokenizer), weakref.ref(assistant_tokenizer)) # Remove strong references inside the function @@ -204,7 +239,12 @@ def test_thread_safety(self): translators = [] def get_translator(): - translator = AssistantVocabTranslatorCache.get_translator(self.target_tokenizer, self.assistant_tokenizer) + translator = AssistantVocabTranslatorCache.get_translator( + self.target_tokenizer, + self.assistant_tokenizer, + assistant_model_device=self.assistant_model_device, + target_vocab_size=self.target_vocab_size, + ) translators.append(translator) threads = [threading.Thread(target=get_translator) for _ in range(10)] From 011f5956d23ab308855d3d2cb7c3cf31c9571a28 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Tue, 17 Dec 2024 22:30:19 +0000 Subject: [PATCH 64/76] Remove unused import and fix `test_speculation_depth` test --- tests/test_universal_assisted_generation.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_universal_assisted_generation.py b/tests/test_universal_assisted_generation.py index 8eae5a71de0749..e6d2ea7ec30e22 100644 --- a/tests/test_universal_assisted_generation.py +++ 
b/tests/test_universal_assisted_generation.py @@ -1,6 +1,5 @@ import unittest -from zmq import device import torch import logging from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig @@ -8,8 +7,7 @@ logging.basicConfig(level=logging.DEBUG, format='%(message)s') -if torch.cuda.is_available(): - device = "cuda" +device = "cuda" if torch.cuda.is_available() else "cpu" class TestUniversalSpeculativeDecoding(unittest.TestCase): @classmethod @@ -99,7 +97,7 @@ def test_speculation_depth(self): self.generator.input_ids = input_ids for depth in [1, 8, 17]: - self.generation_config.num_assistant_tokens = depth + self.generator.num_assistant_tokens = depth candidates, scores = self.generator.get_candidates(input_ids) self.assertLessEqual( candidates.shape[1] - input_ids.shape[1], depth From 26524900c4ac2295ed40de96606ffab5a8a4789c Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Wed, 18 Dec 2024 21:00:37 +0000 Subject: [PATCH 65/76] exclude special and reserved tokens from tokenizer for UAG --- tests/test_universal_assisted_generation.py | 35 +++++---------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/tests/test_universal_assisted_generation.py b/tests/test_universal_assisted_generation.py index e6d2ea7ec30e22..8c45a0ba148622 100644 --- a/tests/test_universal_assisted_generation.py +++ b/tests/test_universal_assisted_generation.py @@ -1,11 +1,9 @@ import unittest import torch -import logging from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig from transformers.generation.candidate_generator import UniversalSpeculativeDecodingGenerator -logging.basicConfig(level=logging.DEBUG, format='%(message)s') device = "cuda" if torch.cuda.is_available() else "cpu" @@ -16,11 +14,11 @@ def setUpClass(cls): cls.main_model = AutoModelForCausalLM.from_pretrained( "meta-llama/Llama-3.2-1B-Instruct").to(device) cls.assistant_model = AutoModelForCausalLM.from_pretrained( - "Qwen/Qwen2.5-0.5B-Instruct").to(device) + "hf-internal-testing/tiny-random-gpt2").to(device) cls.main_tokenizer = AutoTokenizer.from_pretrained( "meta-llama/Llama-3.2-1B-Instruct") cls.assistant_tokenizer = AutoTokenizer.from_pretrained( - "Qwen/Qwen2.5-0.5B-Instruct") + "hf-internal-testing/tiny-random-gpt2") cls.generation_config = GenerationConfig() # Ensure required tokens exist @@ -62,35 +60,16 @@ def test_mismatched_vocabularies(self): # Find a token that is not in the assistant tokenizer but in # the main tokenizer. 
missing_token = next( - token - for token in self.main_tokenizer.get_vocab() - if token not in self.assistant_tokenizer.get_vocab() + token for token in self.main_tokenizer.get_vocab() + if token not in self.assistant_tokenizer.get_vocab() and + token not in self.main_tokenizer.all_special_tokens and + "reserved_" not in token ) - - input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) + input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) self.generator.input_ids = input_ids candidates, scores = self.generator.get_candidates(input_ids) self.assertIsNotNone(candidates) - def test_empty_input(self): - if False: - """Test handling of empty input""" - input_ids = torch.tensor([[]], dtype=torch.long) - self.generator.input_ids = input_ids - with self.assertRaises(ValueError): - self.generator.get_candidates(input_ids) - - def test_long_sequence(self): - if False: - """Test handling of very long input sequences""" - long_input = torch.ones((1, 2048), dtype=torch.long) - self.generator.input_ids = long_input - candidates, scores = self.generator.get_candidates(long_input) - self.assertLessEqual( - candidates.shape[1], - self.main_model.config.max_position_embeddings, - ) - def test_speculation_depth(self): """Test different speculation depths""" input_ids = self.main_tokenizer.encode("Test text", return_tensors="pt") From 701edbb522c62dffba22ebc2f84d8a7d3107a38f Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Thu, 19 Dec 2024 08:51:57 +0000 Subject: [PATCH 66/76] mv `test_universal_assisted_generation.py` to `generation/test_candidate_generator.py` --- tests/generation/test_candidate_generator.py | 90 ++++++++++++++++++ tests/test_universal_assisted_generation.py | 96 -------------------- 2 files changed, 90 insertions(+), 96 deletions(-) delete mode 100644 tests/test_universal_assisted_generation.py diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index dd7e427a3bfda9..7d005f42536ab9 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,9 +1,12 @@ import gc +import logging import threading import unittest import weakref from unittest.mock import MagicMock +from zmq import device + import numpy as np import torch @@ -11,8 +14,11 @@ AssistantToTargetTranslator, AssistantVocabTranslatorCache, AssistedCandidateGeneratorDifferentTokenizers, + UniversalSpeculativeDecodingGenerator ) +from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig + class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): def test_no_intersection(self): @@ -216,3 +222,87 @@ def get_translator(): # All translators should be the same instance for translator in translators: self.assertIs(translators[0], translator, "All translators should be identical across threads") + + +class TestUniversalSpeculativeDecoding(unittest.TestCase): + device = "cuda" if torch.cuda.is_available() else "cpu" + + @classmethod + def setUpClass(cls): + cls.assistant_model = AutoModelForCausalLM.from_pretrained( + "hf-internal-testing/tiny-random-gpt2").to(cls.device) + cls.main_tokenizer = AutoTokenizer.from_pretrained( + "meta-llama/Llama-3.2-1B-Instruct") + cls.assistant_tokenizer = AutoTokenizer.from_pretrained( + "hf-internal-testing/tiny-random-gpt2") + cls.generation_config = GenerationConfig() + + # Ensure required tokens exist + if cls.main_tokenizer.pad_token_id is None: + cls.main_tokenizer.pad_token_id = 
cls.main_tokenizer.eos_token_id + if cls.main_tokenizer.bos_token_id is None: + cls.main_tokenizer.bos_token_id = cls.main_tokenizer.eos_token_id + + def setUp(self): + self.input_ids = torch.tensor([[1, 2, 3]]).to(self.device) + self.model_kwargs = { + "attention_mask": torch.ones_like(self.input_ids).to(self.device), + } + self.generator = UniversalSpeculativeDecodingGenerator( + input_ids=self.input_ids, + assistant_model=self.assistant_model, + target_tokenizer=self.main_tokenizer, + assistant_tokenizer=self.assistant_tokenizer, + generation_config=self.generation_config, + model_kwargs=self.model_kwargs, + target_vocab_size=self.main_tokenizer.vocab_size, + ) + + def test_basic_generation(self): + """Test basic speculative decoding works""" + input_text = "The quick brown fox" + input_ids = self.main_tokenizer.encode(input_text, return_tensors="pt") + self.generator.input_ids = input_ids + candidates, scores = self.generator.get_candidates(input_ids) + + self.assertIsNotNone(candidates) + self.assertIsNotNone(scores) + self.assertTrue(torch.is_tensor(candidates)) + self.assertTrue(torch.is_tensor(scores)) + + def test_mismatched_vocabularies(self): + """Test handling of mismatched vocabularies between models""" + # Create input with tokens present in main but not assistant vocab + # Find a token that is not in the assistant tokenizer but in + # the main tokenizer. + missing_token = next( + token for token in self.main_tokenizer.get_vocab() + if token not in self.assistant_tokenizer.get_vocab() and + token not in self.main_tokenizer.all_special_tokens and + "reserved_" not in token + ) + input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) + self.generator.input_ids = input_ids + candidates, scores = self.generator.get_candidates(input_ids) + self.assertIsNotNone(candidates) + + def test_speculation_depth(self): + """Test different speculation depths""" + input_ids = self.main_tokenizer.encode("Test text", return_tensors="pt") + self.generator.input_ids = input_ids + + for depth in [1, 8, 17]: + self.generator.num_assistant_tokens = depth + candidates, scores = self.generator.get_candidates(input_ids) + self.assertLessEqual( + candidates.shape[1] - input_ids.shape[1], depth + ) + + def test_device_consistency(self): + """Test handling of inputs on different devices""" + if torch.cuda.is_available(): + input_ids = torch.tensor([[1, 2, 3]]).to( + self.generator.assistant_model.device) + self.generator.input_ids = input_ids + candidates, scores = self.generator.get_candidates(input_ids) + self.assertEqual(candidates.device, input_ids.device) diff --git a/tests/test_universal_assisted_generation.py b/tests/test_universal_assisted_generation.py deleted file mode 100644 index 8c45a0ba148622..00000000000000 --- a/tests/test_universal_assisted_generation.py +++ /dev/null @@ -1,96 +0,0 @@ -import unittest - -import torch -from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig -from transformers.generation.candidate_generator import UniversalSpeculativeDecodingGenerator - - -device = "cuda" if torch.cuda.is_available() else "cpu" - -class TestUniversalSpeculativeDecoding(unittest.TestCase): - @classmethod - def setUpClass(cls): - # Setup main and assistant models - cls.main_model = AutoModelForCausalLM.from_pretrained( - "meta-llama/Llama-3.2-1B-Instruct").to(device) - cls.assistant_model = AutoModelForCausalLM.from_pretrained( - "hf-internal-testing/tiny-random-gpt2").to(device) - cls.main_tokenizer = AutoTokenizer.from_pretrained( - 
"meta-llama/Llama-3.2-1B-Instruct") - cls.assistant_tokenizer = AutoTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-gpt2") - cls.generation_config = GenerationConfig() - - # Ensure required tokens exist - if cls.main_tokenizer.pad_token_id is None: - cls.main_tokenizer.pad_token_id = cls.main_tokenizer.eos_token_id - if cls.main_tokenizer.bos_token_id is None: - cls.main_tokenizer.bos_token_id = cls.main_tokenizer.eos_token_id - - def setUp(self): - self.input_ids = torch.tensor([[1, 2, 3]]).to(device) - self.model_kwargs = { - "attention_mask": torch.ones_like(self.input_ids).to(device), - } - self.generator = UniversalSpeculativeDecodingGenerator( - input_ids=self.input_ids, - assistant_model=self.assistant_model, - target_tokenizer=self.main_tokenizer, - assistant_tokenizer=self.assistant_tokenizer, - generation_config=self.generation_config, - model_kwargs=self.model_kwargs, - target_vocab_size=self.main_tokenizer.vocab_size, - ) - - def test_basic_generation(self): - """Test basic speculative decoding works""" - input_text = "The quick brown fox" - input_ids = self.main_tokenizer.encode(input_text, return_tensors="pt") - self.generator.input_ids = input_ids - candidates, scores = self.generator.get_candidates(input_ids) - - self.assertIsNotNone(candidates) - self.assertIsNotNone(scores) - self.assertTrue(torch.is_tensor(candidates)) - self.assertTrue(torch.is_tensor(scores)) - - def test_mismatched_vocabularies(self): - """Test handling of mismatched vocabularies between models""" - # Create input with tokens present in main but not assistant vocab - # Find a token that is not in the assistant tokenizer but in - # the main tokenizer. - missing_token = next( - token for token in self.main_tokenizer.get_vocab() - if token not in self.assistant_tokenizer.get_vocab() and - token not in self.main_tokenizer.all_special_tokens and - "reserved_" not in token - ) - input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) - self.generator.input_ids = input_ids - candidates, scores = self.generator.get_candidates(input_ids) - self.assertIsNotNone(candidates) - - def test_speculation_depth(self): - """Test different speculation depths""" - input_ids = self.main_tokenizer.encode("Test text", return_tensors="pt") - self.generator.input_ids = input_ids - - for depth in [1, 8, 17]: - self.generator.num_assistant_tokens = depth - candidates, scores = self.generator.get_candidates(input_ids) - self.assertLessEqual( - candidates.shape[1] - input_ids.shape[1], depth - ) - - def test_device_consistency(self): - """Test handling of inputs on different devices""" - if torch.cuda.is_available(): - input_ids = torch.tensor([[1, 2, 3]]).to( - self.generator.assistant_model.device) - self.generator.input_ids = input_ids - candidates, scores = self.generator.get_candidates(input_ids) - self.assertEqual(candidates.device, input_ids.device) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file From 3b89341d7dd8da4b3badd8d890edf3a0bd1022a6 Mon Sep 17 00:00:00 2001 From: gauravjain14 <41287729+gauravjain14@users.noreply.github.com> Date: Thu, 19 Dec 2024 22:54:48 -0800 Subject: [PATCH 67/76] Remove unused imports and fix style using `make style` (#9) --- tests/generation/test_candidate_generator.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index f4e7bba1dd2835..80065c16080462 100644 --- 
a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -1,24 +1,20 @@ import gc -import logging import threading import unittest import weakref from unittest.mock import MagicMock -from zmq import device - import numpy as np import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig from transformers.generation.candidate_generator import ( AssistantToTargetTranslator, AssistantVocabTranslatorCache, AssistedCandidateGeneratorDifferentTokenizers, - UniversalSpeculativeDecodingGenerator + UniversalSpeculativeDecodingGenerator, ) -from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig - class TestAssistedCandidateGeneratorDifferentTokenizers(unittest.TestCase): def test_no_intersection(self): @@ -313,7 +309,7 @@ def test_basic_generation(self): def test_mismatched_vocabularies(self): """Test handling of mismatched vocabularies between models""" # Create input with tokens present in main but not assistant vocab - # Find a token that is not in the assistant tokenizer but in + # Find a token that is not in the assistant tokenizer but in # the main tokenizer. missing_token = next( token for token in self.main_tokenizer.get_vocab() @@ -321,7 +317,7 @@ def test_mismatched_vocabularies(self): token not in self.main_tokenizer.all_special_tokens and "reserved_" not in token ) - input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) + input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) self.generator.input_ids = input_ids candidates, scores = self.generator.get_candidates(input_ids) self.assertIsNotNone(candidates) From e43dba87e79ec81c6ebf61f27be3af068ba7dcba Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Sat, 21 Dec 2024 13:52:27 -0500 Subject: [PATCH 68/76] formatting --- tests/generation/test_candidate_generator.py | 27 +++++++++----------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 80065c16080462..8f3e024ae3588b 100644 --- a/tests/generation/test_candidate_generator.py +++ b/tests/generation/test_candidate_generator.py @@ -265,12 +265,11 @@ class TestUniversalSpeculativeDecoding(unittest.TestCase): @classmethod def setUpClass(cls): - cls.assistant_model = AutoModelForCausalLM.from_pretrained( - "hf-internal-testing/tiny-random-gpt2").to(cls.device) - cls.main_tokenizer = AutoTokenizer.from_pretrained( - "meta-llama/Llama-3.2-1B-Instruct") - cls.assistant_tokenizer = AutoTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-gpt2") + cls.assistant_model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to( + cls.device + ) + cls.main_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct") + cls.assistant_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") cls.generation_config = GenerationConfig() # Ensure required tokens exist @@ -312,10 +311,11 @@ def test_mismatched_vocabularies(self): # Find a token that is not in the assistant tokenizer but in # the main tokenizer. 
missing_token = next( - token for token in self.main_tokenizer.get_vocab() - if token not in self.assistant_tokenizer.get_vocab() and - token not in self.main_tokenizer.all_special_tokens and - "reserved_" not in token + token + for token in self.main_tokenizer.get_vocab() + if token not in self.assistant_tokenizer.get_vocab() + and token not in self.main_tokenizer.all_special_tokens + and "reserved_" not in token ) input_ids = torch.tensor([[self.main_tokenizer.convert_tokens_to_ids(missing_token)]]) self.generator.input_ids = input_ids
@@ -330,15 +330,12 @@ def test_speculation_depth(self): for depth in [1, 8, 17]: self.generator.num_assistant_tokens = depth candidates, scores = self.generator.get_candidates(input_ids) - self.assertLessEqual( - candidates.shape[1] - input_ids.shape[1], depth - ) + self.assertLessEqual(candidates.shape[1] - input_ids.shape[1], depth) def test_device_consistency(self): """Test handling of inputs on different devices""" if torch.cuda.is_available(): - input_ids = torch.tensor([[1, 2, 3]]).to( - self.generator.assistant_model.device) + input_ids = torch.tensor([[1, 2, 3]]).to(self.generator.assistant_model.device) self.generator.input_ids = input_ids candidates, scores = self.generator.get_candidates(input_ids) self.assertEqual(candidates.device, input_ids.device)
From a5297951adca2c2a9946b3fb5771df58610f3991 Mon Sep 17 00:00:00 2001
From: gauravjain14 <41287729+gauravjain14@users.noreply.github.com>
Date: Sat, 21 Dec 2024 15:42:17 -0800
Subject: [PATCH 69/76] Swap gated `meta-llama/llama-3.2` with `allenai/llama` (#10)
---
 tests/generation/test_candidate_generator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 8f3e024ae3588b..2fcda92a9e1b86 100644
--- a/tests/generation/test_candidate_generator.py
+++ b/tests/generation/test_candidate_generator.py
@@ -268,7 +268,7 @@ def setUpClass(cls): cls.assistant_model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to( cls.device ) - cls.main_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct") + cls.main_tokenizer = AutoTokenizer.from_pretrained("allenai/Llama-3.1-Tulu-3-8B-SFT") cls.assistant_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") cls.generation_config = GenerationConfig()
From 25cd5da88016a6afa1ace506c61829d811eae75b Mon Sep 17 00:00:00 2001
From: Jonathan Mamou
Date: Thu, 9 Jan 2025 17:13:09 +0200
Subject: [PATCH 70/76] Fix space sign disagreement (#12)
* default values for AssistantToTargetTranslator fields
* fix space sign
* minor
* fix test + style
---
 .../generation/candidate_generator.py | 37 ++++++++++++++++---
 tests/generation/test_candidate_generator.py | 6 +++
 2 files changed, 37 insertions(+), 6 deletions(-)
diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 26c81da0c1a5a2..5ba09a9d518d88 100644
--- a/src/transformers/generation/candidate_generator.py
+++ b/src/transformers/generation/candidate_generator.py
@@ -627,15 +627,18 @@ def __init__( self, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase", - assistant_model_device, - target_vocab_size: int, + assistant_model_device: str = "cpu", + target_vocab_size: int = None, filter_value: float = -float("Inf"), suppress_tokens_id: int = -1, ): self._target_tokenizer: "PreTrainedTokenizerBase" = target_tokenizer
self._assistant_tokenizer: "PreTrainedTokenizerBase" = assistant_tokenizer - self._assistant_model_device = assistant_model_device - self.target_vocab_size: int = target_vocab_size + self._assistant_model_device: str = assistant_model_device + if target_vocab_size: + self.target_vocab_size: int = target_vocab_size + else: + self.target_vocab_size: int = len(self._target_tokenizer.get_vocab()) self.filter_value: float = filter_value self.suppress_tokens_id: int = suppress_tokens_id self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids()
@@ -646,6 +649,28 @@ def __init__( def _get_assistant_to_target_input_ids(self): target_vocab = self._target_tokenizer.get_vocab() assistant_vocab = self._assistant_tokenizer.get_vocab() + + space_str = " " + target_space_ids = self._target_tokenizer(space_str, add_special_tokens=False)["input_ids"] + if len(target_space_ids) > 0: + target_space_sign = self._target_tokenizer.convert_ids_to_tokens(target_space_ids)[0][0] + + assistant_space_ids = self._assistant_tokenizer(space_str, add_special_tokens=False)["input_ids"] + if len(assistant_space_ids) > 0: + assistant_space_sign = self._assistant_tokenizer.convert_ids_to_tokens(assistant_space_ids)[0][0] + + if target_space_sign != assistant_space_sign: + # If the assistant tokenizer has a different space sign than the target tokenizer, + # we need to replace the assistant space sign with the target space sign in the assistant_vocab. + assistant_vocab = { + ( + tok.replace(assistant_space_sign, target_space_sign, 1) + if tok.startswith(assistant_space_sign) + else tok + ): idx + for tok, idx in assistant_vocab.items() + } + max_assistant_index = max(assistant_vocab.values()) assistant_to_target_input_ids = torch.full((max_assistant_index + 1,), self.suppress_tokens_id, dtype=int) for tok, idx in assistant_vocab.items():
@@ -707,8 +732,8 @@ def get_translator( cls, target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase", - assistant_model_device, - target_vocab_size: int, + assistant_model_device: str = "cpu", + target_vocab_size: int = None, ) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer)
diff --git a/tests/generation/test_candidate_generator.py b/tests/generation/test_candidate_generator.py index 2fcda92a9e1b86..7c65f369742576 100644
--- a/tests/generation/test_candidate_generator.py
+++ b/tests/generation/test_candidate_generator.py
@@ -129,6 +129,12 @@ def __init__(self, vocab=None): def get_vocab(self): return self._vocab + def __call__(self, text, add_special_tokens=True): + # Mock implementation of the __call__ method + tokens = text.split() + input_ids = [self._vocab.get(token, 0) for token in tokens] + return {"input_ids": input_ids} + class TestAssistantVocabTranslatorCache(unittest.TestCase): def setUp(self):
From 77edae266dc9d96ffea94bb9551d1ad8b27db1c9 Mon Sep 17 00:00:00 2001
From: Jonathan Mamou
Date: Thu, 9 Jan 2025 17:29:02 +0200
Subject: [PATCH 71/76] Default values for some fields of assistant to target translator (#11)
* default values for AssistantToTargetTranslator fields
* fix
* add support to empty logit_processors
---
 .../generation/candidate_generator.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 5ba09a9d518d88..80940e3b7e0c92 100644
--- a/src/transformers/generation/candidate_generator.py
+++
b/src/transformers/generation/candidate_generator.py @@ -628,7 +628,8 @@ def __init__( target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase", assistant_model_device: str = "cpu", - target_vocab_size: int = None, + default_AssistantToTargetTranslator + target_vocab_size: Optional[int] = None, filter_value: float = -float("Inf"), suppress_tokens_id: int = -1, ): @@ -642,9 +643,13 @@ def __init__( self.filter_value: float = filter_value self.suppress_tokens_id: int = suppress_tokens_id self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() - self.logits_processors: LogitsProcessorList = LogitsProcessorList( - [SuppressTokensLogitsProcessor(self._get_suppress_input_ids(), self._assistant_model_device)] - ) + self._suppress_input_ids: list[int] = self._get_suppress_input_ids() + self.logits_processors: Optional[LogitsProcessorList] = None + if len(self._suppress_input_ids) > 0: + # len(self._suppress_input_ids) = 0 if the assistant vocab is a subset of the target vocab + self.logits_processors = LogitsProcessorList( + [SuppressTokensLogitsProcessor(self._get_suppress_input_ids(), self._assistant_model_device)] + ) def _get_assistant_to_target_input_ids(self): target_vocab = self._target_tokenizer.get_vocab() @@ -733,7 +738,7 @@ def get_translator( target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase", assistant_model_device: str = "cpu", - target_vocab_size: int = None, + target_vocab_size: Optional[int] = None, ) -> AssistantToTargetTranslator: with cls._lock: assistant_dict = cls._cache.get(target_tokenizer) @@ -826,7 +831,8 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, generation_args["generation_config"].return_dict_in_generate = True # Generate and process outputs using translator - generation_args["logits_processor"] = self._atm_translator.logits_processors + if self._atm_translator.logits_processors is not None: + generation_args["logits_processor"] = self._atm_translator.logits_processors self._prev_assistant_ids, assistant_candidate_logits = self._generate_candidates(generation_args) # Use translator to convert tokens and logits From a2a2882bf29eb374de0ce8dc5ec85577cd72d23a Mon Sep 17 00:00:00 2001 From: Jonathan Mamou Date: Sun, 12 Jan 2025 14:35:20 +0200 Subject: [PATCH 72/76] Update candidate_generator.py (#15) fix typo --- src/transformers/generation/candidate_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 80940e3b7e0c92..d2a246c81f03e7 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -628,7 +628,6 @@ def __init__( target_tokenizer: "PreTrainedTokenizerBase", assistant_tokenizer: "PreTrainedTokenizerBase", assistant_model_device: str = "cpu", - default_AssistantToTargetTranslator target_vocab_size: Optional[int] = None, filter_value: float = -float("Inf"), suppress_tokens_id: int = -1, From a5569471f454ed5c06d66a25a86eeb16ec5f8bbc Mon Sep 17 00:00:00 2001 From: Jonathan Mamou Date: Sun, 12 Jan 2025 17:25:13 +0200 Subject: [PATCH 73/76] BUG fix in _prepare_assistant_input_ids (#14) * fix _prepare_assistant_input_ids * target_to_assistant_input_ids * Update src/transformers/generation/candidate_generator.py Co-authored-by: Nadav Timor --------- Co-authored-by: Nadav Timor --- .../generation/candidate_generator.py | 34 ++++++++++++------- 1 file 
changed, 22 insertions(+), 12 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index d2a246c81f03e7..980505e8979db2 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -641,7 +641,7 @@ def __init__( self.target_vocab_size: int = len(self._target_tokenizer.get_vocab()) self.filter_value: float = filter_value self.suppress_tokens_id: int = suppress_tokens_id - self._assistant_to_target_input_ids = self._get_assistant_to_target_input_ids() + self._assistant_to_target_input_ids, self.target_to_assistant_input_ids = self._get_assistant_to_target_input_ids() self._suppress_input_ids: list[int] = self._get_suppress_input_ids() self.logits_processors: Optional[LogitsProcessorList] = None if len(self._suppress_input_ids) > 0: @@ -677,10 +677,13 @@ def _get_assistant_to_target_input_ids(self): max_assistant_index = max(assistant_vocab.values()) assistant_to_target_input_ids = torch.full((max_assistant_index + 1,), self.suppress_tokens_id, dtype=int) - for tok, idx in assistant_vocab.items(): - if tok in target_vocab: - assistant_to_target_input_ids[idx] = target_vocab[tok] - return assistant_to_target_input_ids.to(self._assistant_model_device) + target_to_assistant_input_id: Dict[int, int] = {} + for tok, assistant_id in assistant_vocab.items(): + target_id = target_vocab.get(tok) + if target_id is not None: + assistant_to_target_input_ids[assistant_id] = target_id + target_to_assistant_input_ids[target_id] = assistant_id + return assistant_to_target_input_ids.to(self._assistant_model_device), target_to_assistant_input_ids def _get_suppress_input_ids(self) -> list[int]: """ @@ -864,13 +867,20 @@ def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> to new_token_count = 1 target_new_ids = target_input_ids[:, -new_token_count:] - # Convert only the new tokens - target_new_text = self.target_tokenizer.batch_decode( - target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True - ) - assistant_new_ids = self.assistant_tokenizer(target_new_text, add_special_tokens=False, return_tensors="pt")[ - "input_ids" - ].to(self.assistant_model.device) + # Convert the new tokens + assistant_new_ids = None + if self._target_seq_len_with_candidates > 0: + # we have only one new token and we can directly convert it + assistant_new_ids = self._atm_translator.target_to_assistant_input_ids.get(target_new_ids[0].item()) + if assistant_new_ids is None: + target_new_text = self.target_tokenizer.batch_decode( + target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True + ) + assistant_new_ids = self.assistant_tokenizer( + target_new_text, add_special_tokens=False, return_tensors="pt" + )["input_ids"].to(self.assistant_model.device) + else: + assistant_new_ids = torch.tensor([[assistant_new_ids]], device=self.assistant_model.device) # Update or initialize assistant IDs if self._prev_assistant_ids is None: From 407d89846d553129aae702e6e882d0813f28e219 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 13 Jan 2025 10:37:49 +0200 Subject: [PATCH 74/76] typo (`target_to_assistant_input_ids`) --- src/transformers/generation/candidate_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 980505e8979db2..60f494c2209f33 100644 --- a/src/transformers/generation/candidate_generator.py +++ 
b/src/transformers/generation/candidate_generator.py @@ -677,7 +677,7 @@ def _get_assistant_to_target_input_ids(self): max_assistant_index = max(assistant_vocab.values()) assistant_to_target_input_ids = torch.full((max_assistant_index + 1,), self.suppress_tokens_id, dtype=int) - target_to_assistant_input_id: Dict[int, int] = {} + target_to_assistant_input_ids: Dict[int, int] = {} for tok, assistant_id in assistant_vocab.items(): target_id = target_vocab.get(tok) if target_id is not None: From a24b1934c3ffe0a107ead0d8e73222c334155588 Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Mon, 13 Jan 2025 10:50:12 +0200 Subject: [PATCH 75/76] formatting --- src/transformers/generation/candidate_generator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 60f494c2209f33..954076058e0426 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -641,7 +641,9 @@ def __init__( self.target_vocab_size: int = len(self._target_tokenizer.get_vocab()) self.filter_value: float = filter_value self.suppress_tokens_id: int = suppress_tokens_id - self._assistant_to_target_input_ids, self.target_to_assistant_input_ids = self._get_assistant_to_target_input_ids() + self._assistant_to_target_input_ids, self.target_to_assistant_input_ids = ( + self._get_assistant_to_target_input_ids() + ) self._suppress_input_ids: list[int] = self._get_suppress_input_ids() self.logits_processors: Optional[LogitsProcessorList] = None if len(self._suppress_input_ids) > 0: From 1afdaa3a6299832389a2469725cdc0a5648e34da Mon Sep 17 00:00:00 2001 From: Nadav Timor Date: Wed, 15 Jan 2025 17:08:36 -0500 Subject: [PATCH 76/76] merge upstream/main --- CODEOWNERS | 369 ++ README.md | 26 +- docker/transformers-all-latest-gpu/Dockerfile | 3 + docs/source/ar/_toctree.yml | 12 +- docs/source/ar/tasks/multiple_choice.md | 452 +++ docs/source/ar/tasks/token_classification.md | 550 +++ docs/source/ar/tasks/translation.md | 407 +++ docs/source/en/_toctree.yml | 14 +- docs/source/en/chat_templating.md | 44 +- docs/source/en/deepspeed.md | 14 + docs/source/en/generation_strategies.md | 28 + docs/source/en/index.md | 7 + docs/source/en/installation.md | 52 +- docs/source/en/llm_optims.md | 11 +- docs/source/en/llm_tutorial.md | 5 +- docs/source/en/model_doc/diffllama.md | 59 + docs/source/en/model_doc/emu3.md | 179 + docs/source/en/model_doc/helium.md | 158 + docs/source/en/model_doc/modernbert.md | 4 +- docs/source/en/model_doc/moonshine.md | 56 + docs/source/en/model_doc/musicgen_melody.md | 1 - docs/source/en/model_doc/qwen2_audio.md | 31 + docs/source/en/model_doc/siglip.md | 2 +- docs/source/en/model_doc/textnet.md | 55 + docs/source/en/model_doc/vitpose.md | 254 ++ docs/source/en/perf_infer_gpu_one.md | 10 +- docs/source/en/quantization/gptq.md | 52 +- docs/source/en/quantization/overview.md | 56 +- docs/source/en/quantization/torchao.md | 17 +- docs/source/en/tasks/video_text_to_text.md | 2 +- .../ja/model_doc/decision_transformer.md | 25 +- docs/source/ko/_toctree.yml | 32 +- docs/source/ko/model_doc/altclip.md | 78 + examples/flax/question-answering/run_qa.py | 2 +- .../run_flax_speech_recognition_seq2seq.py | 2 +- .../flax/text-classification/run_flax_glue.py | 2 +- .../flax/token-classification/run_flax_ner.py | 2 +- .../configuration_my_new_model.py | 4 +- .../image_processing_new_imgproc_model.py | 2 +- .../modular-transformers/modeling_dummy.py | 2 
+- .../modeling_multimodal1.py | 2 +- .../modeling_my_new_model2.py | 2 +- .../modeling_new_task_model.py | 3 - .../modular-transformers/modeling_super.py | 2 +- .../run_audio_classification.py | 2 +- .../contrastive-image-text/run_clip.py | 2 +- .../run_image_classification.py | 2 +- .../run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- .../image-pretraining/run_mim_no_trainer.py | 2 +- .../run_instance_segmentation.py | 2 +- .../run_instance_segmentation_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- .../language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_fim.py | 2 +- .../language-modeling/run_fim_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- .../language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- .../multiple-choice/run_swag_no_trainer.py | 2 +- .../object-detection/run_object_detection.py | 2 +- .../run_object_detection_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- .../question-answering/run_qa_beam_search.py | 2 +- .../run_qa_beam_search_no_trainer.py | 2 +- .../question-answering/run_qa_no_trainer.py | 2 +- .../question-answering/run_seq2seq_qa.py | 2 +- .../run_semantic_segmentation.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../run_speech_recognition_ctc.py | 2 +- .../run_speech_recognition_ctc_adapter.py | 2 +- .../run_speech_recognition_seq2seq.py | 2 +- .../summarization/run_summarization.py | 2 +- .../run_summarization_no_trainer.py | 2 +- .../text-classification/run_classification.py | 2 +- .../pytorch/text-classification/run_glue.py | 2 +- .../run_glue_no_trainer.py | 2 +- .../pytorch/text-classification/run_xnli.py | 2 +- .../pytorch/token-classification/run_ner.py | 2 +- .../run_ner_no_trainer.py | 2 +- .../pytorch/translation/run_translation.py | 2 +- .../translation/run_translation_no_trainer.py | 2 +- .../decision_transformer/requirements.txt | 2 +- .../contrastive-image-text/run_clip.py | 2 +- .../run_image_classification.py | 2 +- .../tensorflow/multiple-choice/run_swag.py | 2 +- .../tensorflow/question-answering/run_qa.py | 2 +- .../summarization/run_summarization.py | 2 +- .../text-classification/run_glue.py | 2 +- .../tensorflow/translation/run_translation.py | 2 +- read_video.py | 77 + run.py | 107 + setup.py | 4 +- src/transformers/__init__.py | 125 +- src/transformers/configuration_utils.py | 7 +- src/transformers/convert_slow_tokenizer.py | 89 + src/transformers/data/data_collator.py | 113 +- src/transformers/dependency_versions_table.py | 2 +- src/transformers/generation/flax_utils.py | 1 - src/transformers/generation/streamers.py | 2 + src/transformers/generation/tf_utils.py | 1 - src/transformers/generation/utils.py | 7 +- src/transformers/image_utils.py | 215 ++ src/transformers/integrations/__init__.py | 2 - .../integrations/flex_attention.py | 2 +- src/transformers/integrations/ggml.py | 248 -- .../integrations/integration_utils.py | 6 + src/transformers/integrations/peft.py | 62 + src/transformers/loss/loss_utils.py | 5 +- .../modeling_gguf_pytorch_utils.py | 142 +- src/transformers/modeling_rope_utils.py | 15 +- src/transformers/modeling_utils.py | 94 +- src/transformers/models/__init__.py | 7 + src/transformers/models/aria/modeling_aria.py | 21 +- src/transformers/models/aria/modular_aria.py | 4 +- 
.../models/auto/configuration_auto.py | 21 +- .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 19 + .../models/auto/processing_auto.py | 2 + .../models/auto/tokenization_auto.py | 10 + .../models/bamba/modeling_bamba.py | 16 +- .../models/bart/tokenization_bart_fast.py | 10 +- .../models/beit/image_processing_beit.py | 2 +- .../models/bit/image_processing_bit.py | 2 +- .../tokenization_blenderbot_fast.py | 10 +- .../models/blip/image_processing_blip.py | 2 +- src/transformers/models/blip/modeling_blip.py | 7 + .../models/blip_2/configuration_blip_2.py | 3 - .../image_processing_bridgetower.py | 2 +- .../chameleon/image_processing_chameleon.py | 2 +- .../models/chameleon/modeling_chameleon.py | 114 +- .../models/chameleon/processing_chameleon.py | 1 + .../image_processing_chinese_clip.py | 2 +- .../models/clip/image_processing_clip.py | 2 +- src/transformers/models/clip/modeling_clip.py | 11 +- .../models/clipseg/modeling_clipseg.py | 7 + .../codegen/tokenization_codegen_fast.py | 10 - .../models/cohere/modeling_cohere.py | 541 +-- .../models/cohere/modular_cohere.py | 400 +++ .../models/cohere2/modeling_cohere2.py | 372 +- .../models/cohere2/modular_cohere2.py | 250 +- .../image_processing_conditional_detr.py | 2 +- .../convnext/image_processing_convnext.py | 2 +- .../deberta/tokenization_deberta_fast.py | 11 - .../image_processing_deformable_detr.py | 2 +- .../image_processing_deformable_detr_fast.py | 84 +- .../modular_deformable_detr.py | 144 + .../models/deit/image_processing_deit.py | 2 +- .../deprecated/deta/image_processing_deta.py | 2 +- .../image_processing_efficientformer.py | 2 +- .../deprecated/tvlt/image_processing_tvlt.py | 2 +- .../vit_hybrid/image_processing_vit_hybrid.py | 2 +- .../models/detr/image_processing_detr.py | 2 +- .../models/detr/image_processing_detr_fast.py | 16 +- src/transformers/models/diffllama/__init__.py | 27 + .../diffllama/configuration_diffllama.py | 199 ++ .../models/diffllama/modeling_diffllama.py | 1423 ++++++++ .../models/diffllama/modular_diffllama.py | 464 +++ .../configuration_dinov2_with_registers.py | 11 +- .../modeling_dinov2_with_registers.py | 50 +- .../modular_dinov2_with_registers.py | 62 +- .../models/donut/image_processing_donut.py | 2 +- .../models/dpt/image_processing_dpt.py | 2 +- .../image_processing_efficientnet.py | 2 +- src/transformers/models/emu3/__init__.py | 29 + .../models/emu3/configuration_emu3.py | 327 ++ .../models/emu3/convert_emu3_weights_to_hf.py | 448 +++ .../models/emu3/image_processing_emu3.py | 552 +++ src/transformers/models/emu3/modeling_emu3.py | 1954 +++++++++++ src/transformers/models/emu3/modular_emu3.py | 1272 +++++++ .../models/emu3/processing_emu3.py | 217 ++ .../models/encodec/modeling_encodec.py | 2 +- .../models/falcon/modeling_falcon.py | 16 +- .../models/flava/image_processing_flava.py | 2 +- .../models/fuyu/image_processing_fuyu.py | 2 +- src/transformers/models/fuyu/modeling_fuyu.py | 9 +- .../models/gemma/modeling_gemma.py | 17 +- .../models/gemma2/modeling_gemma2.py | 19 +- .../models/gemma2/modular_gemma2.py | 2 + src/transformers/models/glm/modeling_glm.py | 18 +- .../models/glpn/image_processing_glpn.py | 2 +- .../models/gpt2/tokenization_gpt2_fast.py | 11 - .../models/gpt_neox/modeling_gpt_neox.py | 16 +- .../gpt_neox/tokenization_gpt_neox_fast.py | 11 +- .../modeling_gpt_neox_japanese.py | 16 +- .../models/granite/configuration_granite.py | 10 + .../models/granite/modeling_granite.py | 17 +- .../models/granitemoe/modeling_granitemoe.py | 16 +- 
.../image_processing_grounding_dino.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 4 +- .../models/groupvit/modeling_groupvit.py | 7 + src/transformers/models/helium/__init__.py | 27 + .../models/helium/configuration_helium.py | 140 + .../models/helium/modeling_helium.py | 1065 ++++++ .../models/helium/modular_helium.py | 171 + .../idefics2/image_processing_idefics2.py | 2 +- .../models/idefics2/modeling_idefics2.py | 33 - .../idefics3/image_processing_idefics3.py | 2 +- .../models/idefics3/modeling_idefics3.py | 13 +- .../imagegpt/image_processing_imagegpt.py | 2 +- .../configuration_instructblip.py | 3 - .../configuration_instructblipvideo.py | 3 - .../image_processing_instructblipvideo.py | 2 +- .../modular_instructblipvideo.py | 3 - .../models/jetmoe/modeling_jetmoe.py | 16 +- .../layoutlmv3/image_processing_layoutlmv3.py | 2 +- .../tokenization_layoutlmv3_fast.py | 10 +- .../models/led/tokenization_led_fast.py | 10 +- .../models/levit/image_processing_levit.py | 2 +- .../models/llama/configuration_llama.py | 2 +- .../models/llama/modeling_llama.py | 17 +- .../models/llava/configuration_llava.py | 4 + .../models/llava/modeling_llava.py | 89 +- .../models/llava/processing_llava.py | 34 +- .../llava_next/configuration_llava_next.py | 4 + .../llava_next/image_processing_llava_next.py | 2 +- .../models/llava_next/modeling_llava_next.py | 97 +- .../llava_next/processing_llava_next.py | 40 +- .../configuration_llava_next_video.py | 4 + .../image_processing_llava_next_video.py | 2 +- .../modeling_llava_next_video.py | 151 +- .../modular_llava_next_video.py | 129 +- .../processing_llava_next_video.py | 74 +- .../configuration_llava_onevision.py | 4 + .../image_processing_llava_onevision.py | 2 +- .../modeling_llava_onevision.py | 16 +- .../processing_llava_onevision.py | 2 +- .../video_processing_llava_onevision.py | 2 +- .../tokenization_longformer_fast.py | 10 +- .../markuplm/tokenization_markuplm_fast.py | 10 +- .../image_processing_mask2former.py | 2 +- .../maskformer/image_processing_maskformer.py | 2 +- src/transformers/models/mimi/modeling_mimi.py | 26 +- .../models/mistral/modeling_mistral.py | 17 +- .../models/mixtral/modeling_mixtral.py | 17 +- .../models/mllama/modeling_mllama.py | 6 +- .../image_processing_mobilenet_v1.py | 2 +- .../image_processing_mobilenet_v2.py | 2 +- .../mobilevit/image_processing_mobilevit.py | 2 +- .../modernbert/configuration_modernbert.py | 5 + .../models/modernbert/modeling_modernbert.py | 184 +- .../models/modernbert/modular_modernbert.py | 166 +- src/transformers/models/moonshine/__init__.py | 27 + .../moonshine/configuration_moonshine.py | 224 ++ .../moonshine/convert_usefulsensors_to_hf.py | 169 + .../models/moonshine/modeling_moonshine.py | 1573 +++++++++ .../models/moonshine/modular_moonshine.py | 1135 +++++++ .../models/moshi/modeling_moshi.py | 41 +- .../models/mvp/tokenization_mvp_fast.py | 10 +- .../models/nemotron/modeling_nemotron.py | 8 +- .../models/nougat/image_processing_nougat.py | 2 +- src/transformers/models/olmo/modeling_olmo.py | 17 +- .../olmo2/convert_olmo2_weights_to_hf.py | 2 + .../models/olmo2/modeling_olmo2.py | 17 +- .../models/olmoe/modeling_olmoe.py | 17 +- .../oneformer/image_processing_oneformer.py | 2 +- .../models/owlv2/image_processing_owlv2.py | 2 +- .../models/owlvit/image_processing_owlvit.py | 2 +- .../models/paligemma/modeling_paligemma.py | 5 - .../perceiver/image_processing_perceiver.py | 2 +- .../models/persimmon/modeling_persimmon.py | 16 +- src/transformers/models/phi/modeling_phi.py | 19 +- 
src/transformers/models/phi/modular_phi.py | 4 +- src/transformers/models/phi3/modeling_phi3.py | 910 ++--- src/transformers/models/phi3/modular_phi3.py | 323 ++ .../models/phimoe/modeling_phimoe.py | 11 +- .../pix2struct/configuration_pix2struct.py | 43 - .../models/pix2struct/modeling_pix2struct.py | 8 - .../pixtral/convert_pixtral_weights_to_hf.py | 148 +- .../pixtral/image_processing_pixtral.py | 37 +- .../models/pixtral/modeling_pixtral.py | 9 +- .../models/pixtral/processing_pixtral.py | 52 +- .../poolformer/image_processing_poolformer.py | 2 +- .../models/pvt/image_processing_pvt.py | 2 +- .../models/qwen2/modeling_qwen2.py | 17 +- .../qwen2_audio/modeling_qwen2_audio.py | 46 +- .../qwen2_audio/processing_qwen2_audio.py | 72 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 26 +- .../qwen2_vl/image_processing_qwen2_vl.py | 7 +- .../models/qwen2_vl/modeling_qwen2_vl.py | 25 +- .../roberta/tokenization_roberta_fast.py | 10 +- .../rt_detr/image_processing_rt_detr.py | 2 +- .../rt_detr/image_processing_rt_detr_fast.py | 63 +- .../models/rt_detr/modular_rt_detr.py | 577 ++++ .../models/sam/image_processing_sam.py | 2 +- .../segformer/image_processing_segformer.py | 2 +- .../models/seggpt/image_processing_seggpt.py | 2 +- .../models/siglip/image_processing_siglip.py | 2 +- .../models/siglip/modeling_siglip.py | 7 + .../modeling_speech_encoder_decoder.py | 3 + .../models/stablelm/modeling_stablelm.py | 16 +- .../models/starcoder2/modeling_starcoder2.py | 17 +- .../superpoint/image_processing_superpoint.py | 2 +- .../swin2sr/image_processing_swin2sr.py | 2 +- .../models/t5/tokenization_t5_fast.py | 1 + src/transformers/models/textnet/__init__.py | 28 + .../models/textnet/configuration_textnet.py | 135 + .../models/textnet/convert_textnet_to_hf.py | 208 ++ .../textnet/image_processing_textnet.py | 355 ++ .../models/textnet/modeling_textnet.py | 487 +++ .../configuration_timm_wrapper.py | 34 +- .../video_llava/configuration_video_llava.py | 4 + .../image_processing_video_llava.py | 2 +- .../video_llava/modeling_video_llava.py | 147 +- .../video_llava/processing_video_llava.py | 14 +- .../videomae/image_processing_videomae.py | 2 +- .../models/vilt/image_processing_vilt.py | 2 +- .../models/vipllava/modeling_vipllava.py | 77 +- .../modeling_vision_encoder_decoder.py | 9 +- .../models/vit/image_processing_vit.py | 2 +- .../vitmatte/image_processing_vitmatte.py | 2 +- src/transformers/models/vitpose/__init__.py | 28 + .../models/vitpose/configuration_vitpose.py | 124 + .../models/vitpose/convert_vitpose_to_hf.py | 355 ++ .../vitpose/image_processing_vitpose.py | 684 ++++ .../models/vitpose/modeling_vitpose.py | 340 ++ .../models/vitpose_backbone/__init__.py | 54 + .../configuration_vitpose_backbone.py | 136 + .../modeling_vitpose_backbone.py | 542 +++ .../models/vivit/image_processing_vivit.py | 2 +- .../models/whisper/generation_whisper.py | 14 +- .../whisper/tokenization_whisper_fast.py | 9 +- .../models/x_clip/modeling_x_clip.py | 7 + .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 18 +- .../models/yolos/image_processing_yolos.py | 2 +- .../models/zamba/modeling_zamba.py | 390 +-- .../zoedepth/image_processing_zoedepth.py | 2 +- src/transformers/pipelines/__init__.py | 2 +- .../pipelines/audio_classification.py | 2 +- .../pipelines/automatic_speech_recognition.py | 6 + src/transformers/pipelines/base.py | 67 +- .../pipelines/document_question_answering.py | 9 +- src/transformers/pipelines/image_to_text.py | 6 + .../pipelines/table_question_answering.py | 7 + 
.../pipelines/text2text_generation.py | 6 + src/transformers/pipelines/text_generation.py | 45 +- src/transformers/pipelines/text_to_audio.py | 13 +- .../pipelines/visual_question_answering.py | 10 +- src/transformers/processing_utils.py | 137 +- src/transformers/quantizers/quantizer_gptq.py | 43 +- src/transformers/testing_utils.py | 56 +- src/transformers/tokenization_utils_base.py | 11 +- src/transformers/tokenization_utils_fast.py | 35 +- src/transformers/trainer.py | 12 +- src/transformers/trainer_callback.py | 8 +- src/transformers/trainer_seq2seq.py | 6 +- src/transformers/training_args.py | 12 +- src/transformers/training_args_seq2seq.py | 6 - src/transformers/utils/__init__.py | 3 + src/transformers/utils/dummy_pt_objects.py | 189 ++ .../utils/dummy_vision_objects.py | 21 + src/transformers/utils/import_utils.py | 39 +- src/transformers/utils/notebook.py | 4 + src/transformers/utils/quantization_config.py | 63 +- tests/deepspeed/test_deepspeed.py | 28 +- tests/fsdp/test_fsdp.py | 3 +- tests/generation/test_utils.py | 75 +- tests/models/blip_2/test_modeling_blip_2.py | 18 +- .../chameleon/test_modeling_chameleon.py | 2 +- tests/models/cohere2/test_modeling_cohere2.py | 5 - tests/models/dbrx/test_modeling_dbrx.py | 2 +- tests/models/diffllama/__init__.py | 0 .../diffllama/test_modeling_diffllama.py | 992 ++++++ tests/models/emu3/__init__.py | 0 tests/models/emu3/test_modeling_emu3.py | 542 +++ tests/models/emu3/test_processor_emu3.py | 85 + tests/models/encodec/test_modeling_encodec.py | 23 +- .../test_modeling_falcon_mamba.py | 4 +- tests/models/fuyu/test_modeling_fuyu.py | 4 +- tests/models/helium/__init__.py | 0 tests/models/helium/test_modeling_helium.py | 110 + tests/models/idefics/test_modeling_idefics.py | 2 +- .../test_modeling_instructblip.py | 5 +- .../test_modeling_instructblipvideo.py | 3 +- tests/models/llama/test_modeling_llama.py | 5 +- tests/models/llava/test_modeling_llava.py | 77 +- tests/models/llava/test_processor_llava.py | 58 +- .../llava_next/test_modeling_llava_next.py | 88 +- .../llava_next/test_processor_llava_next.py | 6 +- .../test_modeling_llava_next_video.py | 105 - .../test_processor_llava_onevision.py | 64 +- tests/models/mistral/test_modeling_mistral.py | 2 +- tests/models/mixtral/test_modeling_mixtral.py | 5 +- tests/models/mllama/test_processor_mllama.py | 62 +- .../test_modeling_mobilenet_v1.py | 2 + .../modernbert/test_modeling_modernbert.py | 144 +- tests/models/moonshine/__init__.py | 0 .../moonshine/test_modeling_moonshine.py | 620 ++++ .../models/nemotron/test_modeling_nemotron.py | 3 +- .../omdet_turbo/test_modeling_omdet_turbo.py | 8 +- .../paligemma/test_modeling_paligemma.py | 2 +- tests/models/phi3/test_modeling_phi3.py | 3 + .../models/pixtral/test_processor_pixtral.py | 2 +- .../qwen2_audio/test_modeling_qwen2_audio.py | 73 +- .../test_image_processing_qwen2_vl.py | 39 +- .../models/qwen2_vl/test_modeling_qwen2_vl.py | 5 +- tests/models/rt_detr/test_modeling_rt_detr.py | 12 +- .../starcoder2/test_modeling_starcoder2.py | 3 +- .../test_modeling_switch_transformers.py | 2 + tests/models/t5/test_modeling_t5.py | 6 +- tests/models/textnet/__init__.py | 0 .../textnet/test_image_processing_textnet.py | 126 + tests/models/textnet/test_modeling_textnet.py | 348 ++ .../test_modeling_timm_wrapper.py | 62 + tests/models/upernet/test_modeling_upernet.py | 1 - .../video_llava/test_modeling_video_llava.py | 127 +- .../models/vipllava/test_modeling_vipllava.py | 64 +- tests/models/vitpose/__init__.py | 0 .../vitpose/test_image_processing_vitpose.py | 
229 ++ tests/models/vitpose/test_modeling_vitpose.py | 332 ++ tests/models/vitpose_backbone/__init__.py | 0 .../test_modeling_vitpose_backbone.py | 199 ++ tests/models/zamba/test_modeling_zamba.py | 6 +- .../peft_integration/test_peft_integration.py | 65 +- ..._pipelines_automatic_speech_recognition.py | 14 + .../test_pipelines_text_generation.py | 61 +- tests/quantization/ggml/test_ggml.py | 6 +- tests/quantization/gptq/test_gptq.py | 125 +- tests/quantization/hqq/test_hqq.py | 5 + .../quanto_integration/test_quanto.py | 5 +- .../modular/test_conversion_order.py | 63 + tests/test_image_processing_common.py | 18 +- tests/test_modeling_common.py | 80 +- tests/test_processing_common.py | 27 +- tests/test_tokenization_common.py | 20 +- tests/tokenization/test_tokenization_utils.py | 19 + tests/trainer/test_data_collator.py | 46 + tests/trainer/test_trainer.py | 3010 +++++++++-------- tests/utils/test_cache_utils.py | 4 +- tests/utils/test_modeling_rope_utils.py | 24 +- tests/utils/test_modeling_utils.py | 55 + utils/check_config_attributes.py | 9 + utils/check_copies.py | 1 + utils/check_modular_conversion.py | 4 +- utils/check_repo.py | 8 + utils/create_dependency_mapping.py | 59 +- utils/modular_model_converter.py | 20 +- 439 files changed, 29550 insertions(+), 6272 deletions(-) create mode 100644 CODEOWNERS create mode 100644 docs/source/ar/tasks/multiple_choice.md create mode 100644 docs/source/ar/tasks/token_classification.md create mode 100644 docs/source/ar/tasks/translation.md create mode 100644 docs/source/en/model_doc/diffllama.md create mode 100644 docs/source/en/model_doc/emu3.md create mode 100644 docs/source/en/model_doc/helium.md create mode 100644 docs/source/en/model_doc/moonshine.md create mode 100644 docs/source/en/model_doc/textnet.md create mode 100644 docs/source/en/model_doc/vitpose.md create mode 100644 docs/source/ko/model_doc/altclip.md create mode 100644 read_video.py create mode 100644 run.py create mode 100644 src/transformers/models/cohere/modular_cohere.py create mode 100644 src/transformers/models/deformable_detr/modular_deformable_detr.py create mode 100644 src/transformers/models/diffllama/__init__.py create mode 100644 src/transformers/models/diffllama/configuration_diffllama.py create mode 100644 src/transformers/models/diffllama/modeling_diffllama.py create mode 100644 src/transformers/models/diffllama/modular_diffllama.py create mode 100644 src/transformers/models/emu3/__init__.py create mode 100644 src/transformers/models/emu3/configuration_emu3.py create mode 100644 src/transformers/models/emu3/convert_emu3_weights_to_hf.py create mode 100644 src/transformers/models/emu3/image_processing_emu3.py create mode 100644 src/transformers/models/emu3/modeling_emu3.py create mode 100644 src/transformers/models/emu3/modular_emu3.py create mode 100644 src/transformers/models/emu3/processing_emu3.py create mode 100644 src/transformers/models/helium/__init__.py create mode 100644 src/transformers/models/helium/configuration_helium.py create mode 100644 src/transformers/models/helium/modeling_helium.py create mode 100644 src/transformers/models/helium/modular_helium.py create mode 100644 src/transformers/models/moonshine/__init__.py create mode 100644 src/transformers/models/moonshine/configuration_moonshine.py create mode 100644 src/transformers/models/moonshine/convert_usefulsensors_to_hf.py create mode 100644 src/transformers/models/moonshine/modeling_moonshine.py create mode 100644 src/transformers/models/moonshine/modular_moonshine.py create mode 100644 
src/transformers/models/phi3/modular_phi3.py create mode 100644 src/transformers/models/rt_detr/modular_rt_detr.py create mode 100644 src/transformers/models/textnet/__init__.py create mode 100644 src/transformers/models/textnet/configuration_textnet.py create mode 100644 src/transformers/models/textnet/convert_textnet_to_hf.py create mode 100644 src/transformers/models/textnet/image_processing_textnet.py create mode 100644 src/transformers/models/textnet/modeling_textnet.py create mode 100644 src/transformers/models/vitpose/__init__.py create mode 100644 src/transformers/models/vitpose/configuration_vitpose.py create mode 100644 src/transformers/models/vitpose/convert_vitpose_to_hf.py create mode 100644 src/transformers/models/vitpose/image_processing_vitpose.py create mode 100644 src/transformers/models/vitpose/modeling_vitpose.py create mode 100644 src/transformers/models/vitpose_backbone/__init__.py create mode 100644 src/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py create mode 100644 src/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py create mode 100644 tests/models/diffllama/__init__.py create mode 100644 tests/models/diffllama/test_modeling_diffllama.py create mode 100644 tests/models/emu3/__init__.py create mode 100644 tests/models/emu3/test_modeling_emu3.py create mode 100644 tests/models/emu3/test_processor_emu3.py create mode 100644 tests/models/helium/__init__.py create mode 100644 tests/models/helium/test_modeling_helium.py create mode 100644 tests/models/moonshine/__init__.py create mode 100644 tests/models/moonshine/test_modeling_moonshine.py create mode 100644 tests/models/textnet/__init__.py create mode 100644 tests/models/textnet/test_image_processing_textnet.py create mode 100644 tests/models/textnet/test_modeling_textnet.py create mode 100644 tests/models/vitpose/__init__.py create mode 100644 tests/models/vitpose/test_image_processing_vitpose.py create mode 100644 tests/models/vitpose/test_modeling_vitpose.py create mode 100644 tests/models/vitpose_backbone/__init__.py create mode 100644 tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py create mode 100644 tests/repo_utils/modular/test_conversion_order.py diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 00000000000000..6d32e835dfa43e --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,369 @@ +# Top-level rules are matched only if nothing else matches +* @Rocketknight1 @ArthurZucker # if no one is pinged based on the other rules, he will do the dispatch +**.md @stevhliu +docs/ @stevhliu +/benchmark/ @McPatate +/docker/ @ydshieh @ArthurZucker + +# More high-level globs catch cases when specific rules later don't apply +/src/transformers/models/*/*processing* @molbap @yonigozlan @qubvel +/src/transformers/models/*/image_processing* @qubvel +/src/transformers/models/*/image_processing_*_fast* @yonigozlan +/src/transformers/**/*_tokenization* @ArthurZucker + +# Owners of subsections of the library +/src/transformers/generation/ @gante +/src/transformers/pipeline/ @Rocketknight1 @yonigozlan +/src/transformers/integrations/ @SunMarc @MekkCyber @muellerzr +/src/transformers/quantizers/ @SunMarc @MekkCyber +/src/transformers/tests/ @ydshieh +/src/transformers/tests/generation/ @gante +/src/transformers/models/auto/ @ArthurZucker +/src/transformers/utils/ @ArthurZucker @Rocketknight1 +/src/transformers/loss/ @ArthurZucker +/src/transformers/onnx/ @michaelbenayoun + +# Specific files come after the sections/globs, so they take priority +/.circleci/config.yml 
@ArthurZucker @ydshieh +/utils/tests_fetcher.py @ydshieh +trainer.py @muellerzr @SunMarc +trainer_utils.py @muellerzr @SunMarc +/utils/modular_model_converter.py @Cyrilvallez @ArthurZucker + +# Owners of individual models are specific / high priority, and so they come last +# mod* captures modeling and modular files + +# Text models +/src/transformers/models/albert/mod*_albert* @ArthurZucker +/src/transformers/models/bamba/mod*_bamba* @ArthurZucker +/src/transformers/models/bart/mod*_bart* @ArthurZucker +/src/transformers/models/barthez/mod*_barthez* @ArthurZucker +/src/transformers/models/bartpho/mod*_bartpho* @ArthurZucker +/src/transformers/models/bert/mod*_bert* @ArthurZucker +/src/transformers/models/bert_generation/mod*_bert_generation* @ArthurZucker +/src/transformers/models/bert_japanese/mod*_bert_japanese* @ArthurZucker +/src/transformers/models/bertweet/mod*_bertweet* @ArthurZucker +/src/transformers/models/big_bird/mod*_big_bird* @ArthurZucker +/src/transformers/models/bigbird_pegasus/mod*_bigbird_pegasus* @ArthurZucker +/src/transformers/models/biogpt/mod*_biogpt* @ArthurZucker +/src/transformers/models/blenderbot/mod*_blenderbot* @ArthurZucker +/src/transformers/models/blenderbot_small/mod*_blenderbot_small* @ArthurZucker +/src/transformers/models/bloom/mod*_bloom* @ArthurZucker +/src/transformers/models/bort/mod*_bort* @ArthurZucker +/src/transformers/models/byt5/mod*_byt5* @ArthurZucker +/src/transformers/models/camembert/mod*_camembert* @ArthurZucker +/src/transformers/models/canine/mod*_canine* @ArthurZucker +/src/transformers/models/codegen/mod*_codegen* @ArthurZucker +/src/transformers/models/code_llama/mod*_code_llama* @ArthurZucker +/src/transformers/models/cohere/mod*_cohere* @ArthurZucker +/src/transformers/models/cohere2/mod*_cohere2* @ArthurZucker +/src/transformers/models/convbert/mod*_convbert* @ArthurZucker +/src/transformers/models/cpm/mod*_cpm* @ArthurZucker +/src/transformers/models/cpmant/mod*_cpmant* @ArthurZucker +/src/transformers/models/ctrl/mod*_ctrl* @ArthurZucker +/src/transformers/models/dbrx/mod*_dbrx* @ArthurZucker +/src/transformers/models/deberta/mod*_deberta* @ArthurZucker +/src/transformers/models/deberta_v2/mod*_deberta_v2* @ArthurZucker +/src/transformers/models/dialogpt/mod*_dialogpt* @ArthurZucker +/src/transformers/models/diffllama/mod*_diffllama* @ArthurZucker +/src/transformers/models/distilbert/mod*_distilbert* @ArthurZucker +/src/transformers/models/dpr/mod*_dpr* @ArthurZucker +/src/transformers/models/electra/mod*_electra* @ArthurZucker +/src/transformers/models/encoder_decoder/mod*_encoder_decoder* @ArthurZucker +/src/transformers/models/ernie/mod*_ernie* @ArthurZucker +/src/transformers/models/ernie_m/mod*_ernie_m* @ArthurZucker +/src/transformers/models/esm/mod*_esm* @ArthurZucker +/src/transformers/models/falcon/mod*_falcon* @ArthurZucker +/src/transformers/models/falcon3/mod*_falcon3* @ArthurZucker +/src/transformers/models/falcon_mamba/mod*_falcon_mamba* @ArthurZucker +/src/transformers/models/fastspeech2_conformer/mod*_fastspeech2_conformer* @ArthurZucker +/src/transformers/models/flan_t5/mod*_flan_t5* @ArthurZucker +/src/transformers/models/flan_ul2/mod*_flan_ul2* @ArthurZucker +/src/transformers/models/flaubert/mod*_flaubert* @ArthurZucker +/src/transformers/models/fnet/mod*_fnet* @ArthurZucker +/src/transformers/models/fsmt/mod*_fsmt* @ArthurZucker +/src/transformers/models/funnel/mod*_funnel* @ArthurZucker +/src/transformers/models/fuyu/mod*_fuyu* @ArthurZucker +/src/transformers/models/gemma/mod*_gemma* @ArthurZucker 
+/src/transformers/models/gemma2/mod*_gemma2* @ArthurZucker +/src/transformers/models/glm/mod*_glm* @ArthurZucker +/src/transformers/models/openai_gpt/mod*_openai_gpt* @ArthurZucker +/src/transformers/models/gpt_neo/mod*_gpt_neo* @ArthurZucker +/src/transformers/models/gpt_neox/mod*_gpt_neox* @ArthurZucker +/src/transformers/models/gpt_neox_japanese/mod*_gpt_neox_japanese* @ArthurZucker +/src/transformers/models/gptj/mod*_gptj* @ArthurZucker +/src/transformers/models/gpt2/mod*_gpt2* @ArthurZucker +/src/transformers/models/gpt_bigcode/mod*_gpt_bigcode* @ArthurZucker +/src/transformers/models/gptsan_japanese/mod*_gptsan_japanese* @ArthurZucker +/src/transformers/models/gpt_sw3/mod*_gpt_sw3* @ArthurZucker +/src/transformers/models/granite/mod*_granite* @ArthurZucker +/src/transformers/models/granitemoe/mod*_granitemoe* @ArthurZucker +/src/transformers/models/herbert/mod*_herbert* @ArthurZucker +/src/transformers/models/ibert/mod*_ibert* @ArthurZucker +/src/transformers/models/jamba/mod*_jamba* @ArthurZucker +/src/transformers/models/jetmoe/mod*_jetmoe* @ArthurZucker +/src/transformers/models/jukebox/mod*_jukebox* @ArthurZucker +/src/transformers/models/led/mod*_led* @ArthurZucker +/src/transformers/models/llama/mod*_llama* @ArthurZucker @Cyrilvallez +/src/transformers/models/longformer/mod*_longformer* @ArthurZucker +/src/transformers/models/longt5/mod*_longt5* @ArthurZucker +/src/transformers/models/luke/mod*_luke* @ArthurZucker +/src/transformers/models/m2m_100/mod*_m2m_100* @ArthurZucker +/src/transformers/models/madlad_400/mod*_madlad_400* @ArthurZucker +/src/transformers/models/mamba/mod*_mamba* @ArthurZucker +/src/transformers/models/mamba2/mod*_mamba2* @ArthurZucker +/src/transformers/models/marian/mod*_marian* @ArthurZucker +/src/transformers/models/markuplm/mod*_markuplm* @ArthurZucker +/src/transformers/models/mbart/mod*_mbart* @ArthurZucker +/src/transformers/models/mega/mod*_mega* @ArthurZucker +/src/transformers/models/megatron_bert/mod*_megatron_bert* @ArthurZucker +/src/transformers/models/megatron_gpt2/mod*_megatron_gpt2* @ArthurZucker +/src/transformers/models/mistral/mod*_mistral* @ArthurZucker +/src/transformers/models/mixtral/mod*_mixtral* @ArthurZucker +/src/transformers/models/mluke/mod*_mluke* @ArthurZucker +/src/transformers/models/mobilebert/mod*_mobilebert* @ArthurZucker +/src/transformers/models/modernbert/mod*_modernbert* @ArthurZucker +/src/transformers/models/mpnet/mod*_mpnet* @ArthurZucker +/src/transformers/models/mpt/mod*_mpt* @ArthurZucker +/src/transformers/models/mra/mod*_mra* @ArthurZucker +/src/transformers/models/mt5/mod*_mt5* @ArthurZucker +/src/transformers/models/mvp/mod*_mvp* @ArthurZucker +/src/transformers/models/myt5/mod*_myt5* @ArthurZucker +/src/transformers/models/nemotron/mod*_nemotron* @ArthurZucker +/src/transformers/models/nezha/mod*_nezha* @ArthurZucker +/src/transformers/models/nllb/mod*_nllb* @ArthurZucker +/src/transformers/models/nllb_moe/mod*_nllb_moe* @ArthurZucker +/src/transformers/models/nystromformer/mod*_nystromformer* @ArthurZucker +/src/transformers/models/olmo/mod*_olmo* @ArthurZucker +/src/transformers/models/olmo2/mod*_olmo2* @ArthurZucker +/src/transformers/models/olmoe/mod*_olmoe* @ArthurZucker +/src/transformers/models/open_llama/mod*_open_llama* @ArthurZucker +/src/transformers/models/opt/mod*_opt* @ArthurZucker +/src/transformers/models/pegasus/mod*_pegasus* @ArthurZucker +/src/transformers/models/pegasus_x/mod*_pegasus_x* @ArthurZucker +/src/transformers/models/persimmon/mod*_persimmon* @ArthurZucker 
+/src/transformers/models/phi/mod*_phi* @ArthurZucker +/src/transformers/models/phi3/mod*_phi3* @ArthurZucker +/src/transformers/models/phimoe/mod*_phimoe* @ArthurZucker +/src/transformers/models/phobert/mod*_phobert* @ArthurZucker +/src/transformers/models/plbart/mod*_plbart* @ArthurZucker +/src/transformers/models/prophetnet/mod*_prophetnet* @ArthurZucker +/src/transformers/models/qdqbert/mod*_qdqbert* @ArthurZucker +/src/transformers/models/qwen2/mod*_qwen2* @ArthurZucker +/src/transformers/models/qwen2_moe/mod*_qwen2_moe* @ArthurZucker +/src/transformers/models/rag/mod*_rag* @ArthurZucker +/src/transformers/models/realm/mod*_realm* @ArthurZucker +/src/transformers/models/recurrent_gemma/mod*_recurrent_gemma* @ArthurZucker +/src/transformers/models/reformer/mod*_reformer* @ArthurZucker +/src/transformers/models/rembert/mod*_rembert* @ArthurZucker +/src/transformers/models/retribert/mod*_retribert* @ArthurZucker +/src/transformers/models/roberta/mod*_roberta* @ArthurZucker +/src/transformers/models/roberta_prelayernorm/mod*_roberta_prelayernorm* @ArthurZucker +/src/transformers/models/roc_bert/mod*_roc_bert* @ArthurZucker +/src/transformers/models/roformer/mod*_roformer* @ArthurZucker +/src/transformers/models/rwkv/mod*_rwkv* @ArthurZucker +/src/transformers/models/splinter/mod*_splinter* @ArthurZucker +/src/transformers/models/squeezebert/mod*_squeezebert* @ArthurZucker +/src/transformers/models/stablelm/mod*_stablelm* @ArthurZucker +/src/transformers/models/starcoder2/mod*_starcoder2* @ArthurZucker +/src/transformers/models/switch_transformers/mod*_switch_transformers* @ArthurZucker +/src/transformers/models/t5/mod*_t5* @ArthurZucker +/src/transformers/models/t5v1.1/mod*_t5v1.1* @ArthurZucker +/src/transformers/models/tapex/mod*_tapex* @ArthurZucker +/src/transformers/models/transfo_xl/mod*_transfo_xl* @ArthurZucker +/src/transformers/models/ul2/mod*_ul2* @ArthurZucker +/src/transformers/models/umt5/mod*_umt5* @ArthurZucker +/src/transformers/models/xmod/mod*_xmod* @ArthurZucker +/src/transformers/models/xglm/mod*_xglm* @ArthurZucker +/src/transformers/models/xlm/mod*_xlm* @ArthurZucker +/src/transformers/models/xlm_prophetnet/mod*_xlm_prophetnet* @ArthurZucker +/src/transformers/models/xlm_roberta/mod*_xlm_roberta* @ArthurZucker +/src/transformers/models/xlm_roberta_xl/mod*_xlm_roberta_xl* @ArthurZucker +/src/transformers/models/xlm_v/mod*_xlm_v* @ArthurZucker +/src/transformers/models/xlnet/mod*_xlnet* @ArthurZucker +/src/transformers/models/yoso/mod*_yoso* @ArthurZucker +/src/transformers/models/zamba/mod*_zamba* @ArthurZucker + +# Vision models +/src/transformers/models/beit/mod*_beit* @amyeroberts @qubvel +/src/transformers/models/bit/mod*_bit* @amyeroberts @qubvel +/src/transformers/models/conditional_detr/mod*_conditional_detr* @amyeroberts @qubvel +/src/transformers/models/convnext/mod*_convnext* @amyeroberts @qubvel +/src/transformers/models/convnextv2/mod*_convnextv2* @amyeroberts @qubvel +/src/transformers/models/cvt/mod*_cvt* @amyeroberts @qubvel +/src/transformers/models/deformable_detr/mod*_deformable_detr* @amyeroberts @qubvel +/src/transformers/models/deit/mod*_deit* @amyeroberts @qubvel +/src/transformers/models/depth_anything/mod*_depth_anything* @amyeroberts @qubvel +/src/transformers/models/depth_anything_v2/mod*_depth_anything_v2* @amyeroberts @qubvel +/src/transformers/models/deta/mod*_deta* @amyeroberts @qubvel +/src/transformers/models/detr/mod*_detr* @amyeroberts @qubvel +/src/transformers/models/dinat/mod*_dinat* @amyeroberts @qubvel 
+/src/transformers/models/dinov2/mod*_dinov2* @amyeroberts @qubvel +/src/transformers/models/dinov2_with_registers/mod*_dinov2_with_registers* @amyeroberts @qubvel +/src/transformers/models/dit/mod*_dit* @amyeroberts @qubvel +/src/transformers/models/dpt/mod*_dpt* @amyeroberts @qubvel +/src/transformers/models/efficientformer/mod*_efficientformer* @amyeroberts @qubvel +/src/transformers/models/efficientnet/mod*_efficientnet* @amyeroberts @qubvel +/src/transformers/models/focalnet/mod*_focalnet* @amyeroberts @qubvel +/src/transformers/models/glpn/mod*_glpn* @amyeroberts @qubvel +/src/transformers/models/hiera/mod*_hiera* @amyeroberts @qubvel +/src/transformers/models/ijepa/mod*_ijepa* @amyeroberts @qubvel +/src/transformers/models/imagegpt/mod*_imagegpt* @amyeroberts @qubvel +/src/transformers/models/levit/mod*_levit* @amyeroberts @qubvel +/src/transformers/models/mask2former/mod*_mask2former* @amyeroberts @qubvel +/src/transformers/models/maskformer/mod*_maskformer* @amyeroberts @qubvel +/src/transformers/models/mobilenet_v1/mod*_mobilenet_v1* @amyeroberts @qubvel +/src/transformers/models/mobilenet_v2/mod*_mobilenet_v2* @amyeroberts @qubvel +/src/transformers/models/mobilevit/mod*_mobilevit* @amyeroberts @qubvel +/src/transformers/models/mobilevitv2/mod*_mobilevitv2* @amyeroberts @qubvel +/src/transformers/models/nat/mod*_nat* @amyeroberts @qubvel +/src/transformers/models/poolformer/mod*_poolformer* @amyeroberts @qubvel +/src/transformers/models/pvt/mod*_pvt* @amyeroberts @qubvel +/src/transformers/models/pvt_v2/mod*_pvt_v2* @amyeroberts @qubvel +/src/transformers/models/regnet/mod*_regnet* @amyeroberts @qubvel +/src/transformers/models/resnet/mod*_resnet* @amyeroberts @qubvel +/src/transformers/models/rt_detr/mod*_rt_detr* @amyeroberts @qubvel +/src/transformers/models/segformer/mod*_segformer* @amyeroberts @qubvel +/src/transformers/models/seggpt/mod*_seggpt* @amyeroberts @qubvel +/src/transformers/models/superpoint/mod*_superpoint* @amyeroberts @qubvel +/src/transformers/models/swiftformer/mod*_swiftformer* @amyeroberts @qubvel +/src/transformers/models/swin/mod*_swin* @amyeroberts @qubvel +/src/transformers/models/swinv2/mod*_swinv2* @amyeroberts @qubvel +/src/transformers/models/swin2sr/mod*_swin2sr* @amyeroberts @qubvel +/src/transformers/models/table_transformer/mod*_table_transformer* @amyeroberts @qubvel +/src/transformers/models/textnet/mod*_textnet* @amyeroberts @qubvel +/src/transformers/models/timm_wrapper/mod*_timm_wrapper* @amyeroberts @qubvel +/src/transformers/models/upernet/mod*_upernet* @amyeroberts @qubvel +/src/transformers/models/van/mod*_van* @amyeroberts @qubvel +/src/transformers/models/vit/mod*_vit* @amyeroberts @qubvel +/src/transformers/models/vit_hybrid/mod*_vit_hybrid* @amyeroberts @qubvel +/src/transformers/models/vitdet/mod*_vitdet* @amyeroberts @qubvel +/src/transformers/models/vit_mae/mod*_vit_mae* @amyeroberts @qubvel +/src/transformers/models/vitmatte/mod*_vitmatte* @amyeroberts @qubvel +/src/transformers/models/vit_msn/mod*_vit_msn* @amyeroberts @qubvel +/src/transformers/models/vitpose/mod*_vitpose* @amyeroberts @qubvel +/src/transformers/models/yolos/mod*_yolos* @amyeroberts @qubvel +/src/transformers/models/zoedepth/mod*_zoedepth* @amyeroberts @qubvel + +# Audio models +/src/transformers/models/audio_spectrogram_transformer/mod*_audio_spectrogram_transformer* @eustlb +/src/transformers/models/bark/mod*_bark* @eustlb +/src/transformers/models/clap/mod*_clap* @eustlb +/src/transformers/models/dac/mod*_dac* @eustlb 
+/src/transformers/models/encodec/mod*_encodec* @eustlb +/src/transformers/models/hubert/mod*_hubert* @eustlb +/src/transformers/models/mctct/mod*_mctct* @eustlb +/src/transformers/models/mimi/mod*_mimi* @eustlb +/src/transformers/models/mms/mod*_mms* @eustlb +/src/transformers/models/moshi/mod*_moshi* @eustlb +/src/transformers/models/musicgen/mod*_musicgen* @eustlb +/src/transformers/models/musicgen_melody/mod*_musicgen_melody* @eustlb +/src/transformers/models/pop2piano/mod*_pop2piano* @eustlb +/src/transformers/models/seamless_m4t/mod*_seamless_m4t* @eustlb +/src/transformers/models/seamless_m4t_v2/mod*_seamless_m4t_v2* @eustlb +/src/transformers/models/sew/mod*_sew* @eustlb +/src/transformers/models/sew_d/mod*_sew_d* @eustlb +/src/transformers/models/speech_to_text/mod*_speech_to_text* @eustlb +/src/transformers/models/speech_to_text_2/mod*_speech_to_text_2* @eustlb +/src/transformers/models/speecht5/mod*_speecht5* @eustlb +/src/transformers/models/unispeech/mod*_unispeech* @eustlb +/src/transformers/models/unispeech_sat/mod*_unispeech_sat* @eustlb +/src/transformers/models/univnet/mod*_univnet* @eustlb +/src/transformers/models/vits/mod*_vits* @eustlb +/src/transformers/models/wav2vec2/mod*_wav2vec2* @eustlb +/src/transformers/models/wav2vec2_bert/mod*_wav2vec2_bert* @eustlb +/src/transformers/models/wav2vec2_conformer/mod*_wav2vec2_conformer* @eustlb +/src/transformers/models/wav2vec2_phoneme/mod*_wav2vec2_phoneme* @eustlb +/src/transformers/models/wavlm/mod*_wavlm* @eustlb +/src/transformers/models/whisper/mod*_whisper* @eustlb +/src/transformers/models/xls_r/mod*_xls_r* @eustlb +/src/transformers/models/xlsr_wav2vec2/mod*_xlsr_wav2vec2* @eustlb + +# Video models +/src/transformers/models/timesformer/mod*_timesformer* @Rocketknight1 +/src/transformers/models/videomae/mod*_videomae* @Rocketknight1 +/src/transformers/models/vivit/mod*_vivit* @Rocketknight1 + +# Multimodal models +/src/transformers/models/align/mod*_align* @zucchini-nlp +/src/transformers/models/altclip/mod*_altclip* @zucchini-nlp +/src/transformers/models/aria/mod*_aria* @zucchini-nlp +/src/transformers/models/blip/mod*_blip* @zucchini-nlp +/src/transformers/models/blip_2/mod*_blip_2* @zucchini-nlp +/src/transformers/models/bridgetower/mod*_bridgetower* @zucchini-nlp +/src/transformers/models/bros/mod*_bros* @zucchini-nlp +/src/transformers/models/chameleon/mod*_chameleon* @zucchini-nlp +/src/transformers/models/chinese_clip/mod*_chinese_clip* @zucchini-nlp +/src/transformers/models/clip/mod*_clip* @zucchini-nlp +/src/transformers/models/clipseg/mod*_clipseg* @zucchini-nlp +/src/transformers/models/clvp/mod*_clvp* @zucchini-nlp +/src/transformers/models/colpali/mod*_colpali* @zucchini-nlp @yonigozlan +/src/transformers/models/data2vec/mod*_data2vec* @zucchini-nlp +/src/transformers/models/deplot/mod*_deplot* @zucchini-nlp +/src/transformers/models/donut/mod*_donut* @zucchini-nlp +/src/transformers/models/flava/mod*_flava* @zucchini-nlp +/src/transformers/models/git/mod*_git* @zucchini-nlp +/src/transformers/models/grounding_dino/mod*_grounding_dino* @qubvel +/src/transformers/models/groupvit/mod*_groupvit* @zucchini-nlp +/src/transformers/models/idefics/mod*_idefics* @zucchini-nlp +/src/transformers/models/idefics2/mod*_idefics2* @zucchini-nlp +/src/transformers/models/idefics3/mod*_idefics3* @zucchini-nlp +/src/transformers/models/instructblip/mod*_instructblip* @zucchini-nlp +/src/transformers/models/instructblipvideo/mod*_instructblipvideo* @zucchini-nlp +/src/transformers/models/kosmos_2/mod*_kosmos_2* 
@zucchini-nlp +/src/transformers/models/layoutlm/mod*_layoutlm* @NielsRogge +/src/transformers/models/layoutlmv2/mod*_layoutlmv2* @NielsRogge +/src/transformers/models/layoutlmv3/mod*_layoutlmv3* @NielsRogge +/src/transformers/models/layoutxlm/mod*_layoutxlm* @NielsRogge +/src/transformers/models/lilt/mod*_lilt* @zucchini-nlp +/src/transformers/models/llava/mod*_llava* @zucchini-nlp @arthurzucker +/src/transformers/models/llava_next/mod*_llava_next* @zucchini-nlp +/src/transformers/models/llava_next_video/mod*_llava_next_video* @zucchini-nlp +/src/transformers/models/llava_onevision/mod*_llava_onevision* @zucchini-nlp +/src/transformers/models/lxmert/mod*_lxmert* @zucchini-nlp +/src/transformers/models/matcha/mod*_matcha* @zucchini-nlp +/src/transformers/models/mgp_str/mod*_mgp_str* @zucchini-nlp +/src/transformers/models/mllama/mod*_mllama* @zucchini-nlp +/src/transformers/models/nougat/mod*_nougat* @NielsRogge +/src/transformers/models/omdet_turbo/mod*_omdet_turbo* @qubvel @yonigozlan +/src/transformers/models/oneformer/mod*_oneformer* @zucchini-nlp +/src/transformers/models/owlvit/mod*_owlvit* @qubvel +/src/transformers/models/owlv2/mod*_owlv2* @qubvel +/src/transformers/models/paligemma/mod*_paligemma* @zucchini-nlp @molbap +/src/transformers/models/perceiver/mod*_perceiver* @zucchini-nlp +/src/transformers/models/pix2struct/mod*_pix2struct* @zucchini-nlp +/src/transformers/models/pixtral/mod*_pixtral* @zucchini-nlp @ArthurZucker +/src/transformers/models/qwen2_audio/mod*_qwen2_audio* @zucchini-nlp @ArthurZucker +/src/transformers/models/qwen2_vl/mod*_qwen2_vl* @zucchini-nlp @ArthurZucker +/src/transformers/models/sam/mod*_sam* @zucchini-nlp @ArthurZucker +/src/transformers/models/siglip/mod*_siglip* @zucchini-nlp +/src/transformers/models/speech_encoder_decoder/mod*_speech_encoder_decoder* @zucchini-nlp +/src/transformers/models/tapas/mod*_tapas* @NielsRogge +/src/transformers/models/trocr/mod*_trocr* @zucchini-nlp +/src/transformers/models/tvlt/mod*_tvlt* @zucchini-nlp +/src/transformers/models/tvp/mod*_tvp* @zucchini-nlp +/src/transformers/models/udop/mod*_udop* @zucchini-nlp +/src/transformers/models/video_llava/mod*_video_llava* @zucchini-nlp +/src/transformers/models/vilt/mod*_vilt* @zucchini-nlp +/src/transformers/models/vipllava/mod*_vipllava* @zucchini-nlp +/src/transformers/models/vision_encoder_decoder/mod*_vision_encoder_decoder* @Rocketknight1 +/src/transformers/models/vision_text_dual_encoder/mod*_vision_text_dual_encoder* @Rocketknight1 +/src/transformers/models/visual_bert/mod*_visual_bert* @zucchini-nlp +/src/transformers/models/xclip/mod*_xclip* @zucchini-nlp + +# Reinforcement learning models +/src/transformers/models/decision_transformer/mod*_decision_transformer* @Rocketknight1 +/src/transformers/models/trajectory_transformer/mod*_trajectory_transformer* @Rocketknight1 + +# Time series models +/src/transformers/models/autoformer/mod*_autoformer* @Rocketknight1 +/src/transformers/models/informer/mod*_informer* @Rocketknight1 +/src/transformers/models/patchtsmixer/mod*_patchtsmixer* @Rocketknight1 +/src/transformers/models/patchtst/mod*_patchtst* @Rocketknight1 +/src/transformers/models/time_series_transformer/mod*_time_series_transformer* @Rocketknight1 + +# Graph models +/src/transformers/models/graphormer/mod*_graphormer* @clefourrier + +# Finally, files with no owners that shouldn't generate pings, usually automatically generated and checked in the CI +utils/dummy* \ No newline at end of file diff --git a/README.md b/README.md index 42403f84b885da..8ab5ceaf7e687b 
100644 --- a/README.md +++ b/README.md @@ -255,17 +255,39 @@ You should install 🤗 Transformers in a [virtual environment](https://docs.pyt First, create a virtual environment with the version of Python you're going to use and activate it. -Then, you will need to install at least one of Flax, PyTorch, or TensorFlow. -Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages regarding the specific installation command for your platform. +**macOS/Linux** + +```bash +python -m venv env +source env/bin/activate +``` + +**Windows** + +``` +python -m venv env +env\Scripts\activate +``` + +To use 🤗 Transformers, you must install at least one of Flax, PyTorch, or TensorFlow. Refer to the official installation guides for platform-specific commands: + +[TensorFlow installation page](https://www.tensorflow.org/install/), +[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows: -```bash +``` pip install transformers ``` If you'd like to play with the examples or need the bleeding edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/docs/transformers/installation#installing-from-source). +```bash +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install . +``` + ### With conda 🤗 Transformers can be installed using conda as follows: diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index b597f5a73fb5be..b40ba3f35ff874 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -65,6 +65,9 @@ RUN python3 -m pip install --no-cache-dir python-Levenshtein # For `FastSpeech2ConformerTokenizer` tokenizer RUN python3 -m pip install --no-cache-dir g2p-en +# For some bitsandbytes tests +RUN python3 -m pip install --no-cache-dir einops + # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml index c306b0ada80691..01568e93561de4 100644 --- a/docs/source/ar/_toctree.yml +++ b/docs/source/ar/_toctree.yml @@ -35,20 +35,20 @@ sections: # - local: tasks/sequence_classification # title: تصنيف النصوص -# - local: tasks/token_classification -# title: تصنيف الرموز + - local: tasks/token_classification + title: تصنيف الرموز - local: tasks/question_answering title: الإجابة على الأسئلة # - local: tasks/language_modeling # title: نمذجة اللغة السببية # - local: tasks/masked_language_modeling # title: نمذجة اللغة المقنعة -# - local: tasks/translation -# title: الترجمة + - local: tasks/translation + title: الترجمة - local: tasks/summarization title: التلخيص -# - local: tasks/multiple_choice -# title: الاختيار المتعدد + - local: tasks/multiple_choice + title: الاختيار المتعدد title: معالجة اللغات الطبيعية # - isExpanded: false # sections: diff --git a/docs/source/ar/tasks/multiple_choice.md b/docs/source/ar/tasks/multiple_choice.md new file mode 100644 index 00000000000000..78f98560754f11 --- /dev/null +++ b/docs/source/ar/tasks/multiple_choice.md @@ -0,0 +1,452 @@ + + +# الاختيار من متعدد (Multiple choice) + +[[open-in-colab]] + +مهمة الاختيار من متعدد مشابهة لمهمة الإجابة على الأسئلة، ولكن مع توفير عدة إجابات محتملة مع سياق، ويُدرّب النموذج على تحديد الإجابة الصحيحة. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط نموذج [BERT](https://huggingface.co/google-bert/bert-base-uncased) باستخدام الإعداد `regular` لمجموعة بيانات [SWAG](https://huggingface.co/datasets/swag) لاختيار الإجابة الأفضل من بين الخيارات المتعددة المتاحة مع السياق. +2. استخدام النموذج المضبوط للاستدلال. + +قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate +``` + +نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند المطالبة، أدخل الرمز المميز الخاص بك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات SWAG + +ابدأ بتحميل تهيئة `regular` لمجموعة بيانات SWAG من مكتبة 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> swag = load_dataset("swag", "regular") +``` + +ثم ألق نظرة على مثال: + +```py +>>> swag["train"][0] +{'ending0': 'passes by walking down the street playing their instruments.', + 'ending1': 'has heard approaching them.', + 'ending2': "arrives and they're outside dancing and asleep.", + 'ending3': 'turns the lead singer watches the performance.', + 'fold-ind': '3416', + 'gold-source': 'gold', + 'label': 0, + 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.', + 'sent2': 'A drum line', + 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line', + 'video-id': 'anetv_jkn6uvmqwh4'} +``` + +على الرغم من أن الحقول تبدو كثيرة، إلا أنها في الواقع بسيطة جداً: + +- `sent1` و `sent2`: يعرض هذان الحقلان بداية الجملة، وبدمجهما معًا، نحصل على حقل `startphrase`. +- `ending`: يقترح نهاية محتملة للجملة، واحدة منها فقط هي الصحيحة. +- `label`: يحدد نهاية الجملة الصحيحة. + +## المعالجة المسبقة (Preprocess) + +الخطوة التالية هي استدعاء مُجزئ BERT لمعالجة بدايات الجمل والنهايات الأربع المحتملة: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") +``` + +تحتاج دالة المعالجة المسبقة التي تريد إنشاءها إلى: + +1. 
إنشاء أربع نسخ من حقل `sent1` ودمج كل منها مع `sent2` لإعادة إنشاء كيفية بدء الجملة. +2. دمج `sent2` مع كل من نهايات الجمل الأربع المحتملة. +3. تتجميع هاتين القائمتين لتتمكن من تجزئتهما، ثم إعادة ترتيبها بعد ذلك بحيث يكون لكل مثال حقول `input_ids` و `attention_mask` و `labels` مقابلة. + + +```py +>>> ending_names = ["ending0", "ending1", "ending2", "ending3"] + +>>> def preprocess_function(examples): +... first_sentences = [[context] * 4 for context in examples["sent1"]] +... question_headers = examples["sent2"] +... second_sentences = [ +... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) +... ] + +... first_sentences = sum(first_sentences, []) +... second_sentences = sum(second_sentences, []) + +... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) +... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} +``` + +لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] الخاصة بـ 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد: + +```py +tokenized_swag = swag.map(preprocess_function, batched=True) +``` + +لا يحتوي 🤗 Transformers على مجمع بيانات للاختيار من متعدد، لذلك ستحتاج إلى تكييف [`DataCollatorWithPadding`] لإنشاء دفعة من الأمثلة. من الأكفأ إضافة حشو (padding) ديناميكي للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول. + +يقوم `DataCollatorForMultipleChoice` بتجميع جميع مدخلات النموذج، ويطبق الحشو، ثم يعيد تجميع النتائج في شكلها الأصلي: + + + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import torch + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="pt", +... ) + +... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} +... batch["labels"] = torch.tensor(labels, dtype=torch.int64) +... return batch +``` + + + +```py +>>> from dataclasses import dataclass +>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +>>> from typing import Optional, Union +>>> import tensorflow as tf + +>>> @dataclass +... class DataCollatorForMultipleChoice: +... """ +... Data collator that will dynamically pad the inputs for multiple choice received. +... """ + +... tokenizer: PreTrainedTokenizerBase +... padding: Union[bool, str, PaddingStrategy] = True +... max_length: Optional[int] = None +... 
pad_to_multiple_of: Optional[int] = None + +... def __call__(self, features): +... label_name = "label" if "label" in features[0].keys() else "labels" +... labels = [feature.pop(label_name) for feature in features] +... batch_size = len(features) +... num_choices = len(features[0]["input_ids"]) +... flattened_features = [ +... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features +... ] +... flattened_features = sum(flattened_features, []) + +... batch = self.tokenizer.pad( +... flattened_features, +... padding=self.padding, +... max_length=self.max_length, +... pad_to_multiple_of=self.pad_to_multiple_of, +... return_tensors="tf", +... ) + +... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} +... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) +... return batch +``` + + + +## التقييم (Evaluate) + +يُفضل غالبًا تضمين مقياس أثناء التدريب لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، قم بتحميل مقياس [الدقة](https://huggingface.co/spaces/evaluate-metric/accuracy) (انظر إلى [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) لـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل المقياس وحسابه): + +```py +>>> import evaluate + +>>> accuracy = evaluate.load("accuracy") +``` + +ثم أنشئ دالة لتمرير التنبؤات والتسميات إلى [`~evaluate.EvaluationModule.compute`] لحساب الدقة: + +```py +>>> import numpy as np + +>>> def compute_metrics(eval_pred): +... predictions, labels = eval_pred +... predictions = np.argmax(predictions, axis=1) +... return accuracy.compute(predictions=predictions, references=labels) +``` + +دالتك `compute_metrics` جاهزة الآن، وستعود إليها عند إعداد تدريبك. + +## التدريب (Train) + + + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام [`Trainer`], فراجع الدرس الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت جاهز لبدء تدريب نموذجك الآن! قم بتحميل BERT باستخدام [`AutoModelForMultipleChoice`]: + +```py +>>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer + +>>> model = AutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased") +``` + +في هذه المرحلة، تبقى ثلاث خطوات فقط: + +1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعلمة الوحيدة المطلوبة هي `output_dir` التي تحدد مكان حفظ نموذجك. ستدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم الدقة وحفظ نقطة فحص التدريب. +2. مرر معلمات التدريب إلى [`Trainer`] جنبًا إلى جنب مع النموذج ومُجمِّع البيانات والمعالج ودالة تجميع البيانات ودالة `compute_metrics`. +3. استدعي [`~Trainer.train`] لضبط نموذجك. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_swag_model", +... eval_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... learning_rate=5e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=3, +... weight_decay=0.01, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_swag["train"], +... eval_dataset=tokenized_swag["validation"], +... processing_class=tokenizer, +... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), +... compute_metrics=compute_metrics, +... 
) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فراجع الدرس الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! + + +لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل التعلم وبعض معلمات التدريب: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 2 +>>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs +>>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) +``` + +ثم يمكنك تحميل BERT باستخدام [`TFAutoModelForMultipleChoice`]: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased") +``` + +حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_swag["train"], +... shuffle=True, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_swag["validation"], +... shuffle=False, +... batch_size=batch_size, +... collate_fn=data_collator, +... ) +``` + +قم بتهيئة النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers تحتوي على دالة خسارة مناسبة للمهمة بشكل افتراضي، لذلك لا تحتاج إلى تحديد واحدة ما لم ترغب في ذلك: + +```py +>>> model.compile(optimizer=optimizer) # لا توجد وسيطة خسارة! +``` + +الخطوتان الأخيرتان قبل بدء التدريب هما: حساب دقة التنبؤات، وتوفير طريقة لرفع النموذج إلى Hub. ويمكن تحقيق ذلك باستخدام [استدعاءات Keras](../main_classes/keras_callbacks) + +مرر دالتك `compute_metrics` إلى [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +حدد مكان دفع نموذجك ومعالجك في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_model", +... tokenizer=tokenizer, +... ) +``` + +ثم قم بتضمين الاستدعاءات معًا: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +أخيرًا، أنت جاهز لبدء تدريب نموذجك! استدعِ[`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة وعدد الحقب والاستدعاءات لضبط النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +للحصول على مثال أكثر تعمقًا حول كيفية ضبط نموذج للاختيار من متعدد، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb) +أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb) المقابل. + + + +## الاستدلال (Inference) + +رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال! 
+ +قم بإنشاء نص واقتراح إجابتين محتملتين: + +```py +>>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette." +>>> candidate1 = "The law does not apply to croissants and brioche." +>>> candidate2 = "The law applies to baguettes." +``` + + + +قم بتحليل كل مطالبة وزوج إجابة مرشح وأعد تنسورات PyTorch. يجب عليك أيضًا إنشاء بعض `العلامات`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_swag_model") +>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True) +>>> labels = torch.tensor(0).unsqueeze(0) +``` + +مرر مدخلاتك والعلامات إلى النموذج وأرجع`logits`: + +```py +>>> from transformers import AutoModelForMultipleChoice + +>>> model = AutoModelForMultipleChoice.from_pretrained("username/my_awesome_swag_model") +>>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels) +>>> logits = outputs.logits +``` + +استخرج الفئة ذات الاحتمالية الأكبر: + +```py +>>> predicted_class = logits.argmax().item() +>>> predicted_class +0 +``` + + +قم بتحليل كل مطالبة وزوج إجابة مرشح وأعد موترات TensorFlow: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_swag_model") +>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True) +``` + +مرر مدخلاتك إلى النموذج وأعد القيم logits: + +```py +>>> from transformers import TFAutoModelForMultipleChoice + +>>> model = TFAutoModelForMultipleChoice.from_pretrained("username/my_awesome_swag_model") +>>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()} +>>> outputs = model(inputs) +>>> logits = outputs.logits +``` + +استخرج الفئة ذات الاحتمالية الأكبر: + +```py +>>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0]) +>>> predicted_class +0 +``` + + diff --git a/docs/source/ar/tasks/token_classification.md b/docs/source/ar/tasks/token_classification.md new file mode 100644 index 00000000000000..e311482aeccb06 --- /dev/null +++ b/docs/source/ar/tasks/token_classification.md @@ -0,0 +1,550 @@ + + +# تصنيف الرموز(Token classification) + +[[open-in-colab]] + + + +يهدف تصنيف الرموز إلى إعطاء تسمية لكل رمز على حدة في الجملة. من أكثر مهام تصنيف الرموز شيوعًا هو التعرف على الكيانات المسماة (NER). يحاول NER تحديد تسمية لكل كيان في الجملة، مثل شخص، أو مكان، أو منظمة. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) على مجموعة بيانات [WNUT 17](https://huggingface.co/datasets/wnut_17) للكشف عن كيانات جديدة. +2. استخدام نموذجك المضبوط بدقة للاستدلال. + + + +للاطلاع جميع البنى والنقاط المتوافقة مع هذه المهمة، نوصي بالرجوع من [صفحة المهمة](https://huggingface.co/tasks/token-classification). + + + +قبل أن تبدأ، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate seqeval +``` + +نحن نشجعك على تسجيل الدخول إلى حساب HuggingFace الخاص بك حتى تتمكن من تحميل ومشاركة نموذجك مع المجتمع. 
عندما يُطلب منك، أدخل رمزك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات WNUT 17 + +ابدأ بتحميل مجموعة بيانات WNUT 17 من مكتبة 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> wnut = load_dataset("wnut_17") +``` + +ثم ألق نظرة على مثال: + +```py +>>> wnut["train"][0] +{'id': '0', + 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0], + 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.'] +} +``` + +يمثل كل رقم في `ner_tags` كياناً. حوّل الأرقام إلى أسماء التصنيفات لمعرفة ماهية الكيانات: + +```py +>>> label_list = wnut["train"].features[f"ner_tags"].feature.names +>>> label_list +[ + "O", + "B-corporation", + "I-corporation", + "B-creative-work", + "I-creative-work", + "B-group", + "I-group", + "B-location", + "I-location", + "B-person", + "I-person", + "B-product", + "I-product", +] +``` + +يشير الحرف الذي يسبق كل `ner_tag` إلى موضع الرمز للكيان: + +- `B-` يشير إلى بداية الكيان. +- `I-` يشير إلى أن الرمز يقع ضمن نفس الكيان (على سبيل المثال، الرمز `State` هو جزء من كيان مثل `Empire State Building`). +- `0` يشير إلى أن الرمز لا يمثل أي كيان. + +## المعالجة المسبقة(Preprocess) + + + +الخطوة التالية هي تحميل مُجزِّئ النصوص DistilBERT للمعالجة المسبقة لحقل `tokens`: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") +``` + +كما رأيت في حقل `tokens` المثال أعلاه، يبدو أن المدخل قد تم تحليله بالفعل. لكن المدخل لم يُجزأ بعد ويتعيّن عليك ضبط `is_split_into_words=True` لتقسيم الكلمات إلى كلمات فرعية. على سبيل المثال: + +```py +>>> example = wnut["train"][0] +>>> tokenized_input = tokenizer(example["tokens"], is_split_into_words=True) +>>> tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"]) +>>> tokens +['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]'] +``` + +ومع ذلك، يضيف هذا بعض الرموز الخاصة `[CLS]` و`[SEP]` وتقسيم الكلمات إلى أجزاء يُنشئ عدم تطابق بين المُدخلات والتسميات. قد يتم تقسيم كلمة واحدة تقابل تسمية واحدة الآن إلى كلمتين فرعيتين. ستحتاج إلى إعادة محاذاة الرموز والتسميات عن طريق: + +1. ربط كل رمز بالكلمة الأصلية باستخدام الخاصية [`word_ids`](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.BatchEncoding.word_ids). +2. تعيين التسمية `-100` للرموز الخاصة `[CLS]` و`[SEP]` بحيث يتم تجاهلها بواسطة دالة الخسارة PyTorch (انظر [CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html)). +3. تسمية الرمز الأول فقط لكلمة معينة. قم بتعيين `-100` لأجزاء الكلمة الأخرى. + +هنا كيف يمكنك إنشاء وظيفة لإعادة محاذاة الرموز والتسميات، وقص الجمل لتتجاوز الحد الأقصى لطول مُدخلات DistilBERT: + +```py +>>> def tokenize_and_align_labels(examples): +... tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) + +... labels = [] +... for i, label in enumerate(examples[f"ner_tags"]): +... word_ids = tokenized_inputs.word_ids(batch_index=i) # تعيين الرموز إلى كلماتهم المقابلة. +... previous_word_idx = None +... label_ids = [] +... for word_idx in word_ids: # تعيين الرموز الخاصة إلى -100. +... 
if word_idx is None: +... label_ids.append(-100) +... elif word_idx != previous_word_idx: # تسمية الرمز الأول فقط لكلمة معينة. +... label_ids.append(label[word_idx]) +... else: +... label_ids.append(-100) +... previous_word_idx = word_idx +... labels.append(label_ids) + +... tokenized_inputs["labels"] = labels +... return tokenized_inputs +``` + +لتطبيق هذه العملية على كامل مجموعة البيانات، استخدم الدالة [`~datasets.Dataset.map`] لمجموعة بيانات 🤗. يمكنك تسريع الدالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد: + +```py +>>> tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True) +``` + +الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorWithPadding`].من الأفضل استخدام *الحشو الديناميكي* للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بالكامل إلى الطول الأقصى. + + + +```py +>>> from transformers import DataCollatorForTokenClassification + +>>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) +``` + + +```py +>>> from transformers import DataCollatorForTokenClassification + +>>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") +``` + + + +## التقييم(Evaluate) + +يُعدّ تضمين مقياس أثناء التدريب مفيدًا في تقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة مع مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، قم بتحميل إطار [seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) (انظر جولة 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) لمعرفة المزيد حول كيفية تحميل وحساب مقياس). يُخرج seqeval عدة نتائج: الدقة، والاستذكار، ومقياس F1، والدقة. + +```py +>>> import evaluate + +>>> seqeval = evaluate.load("seqeval") +``` + +احصل على تسميات الكيانات المسماة (NER) أولاً،ثم أنشئ دالة تُمرر تنبؤاتك وتسمياتك الصحيحة إلى [`~evaluate.EvaluationModule.compute`] لحساب النتائج: + +```py +>>> import numpy as np + +>>> labels = [label_list[i] for i in example[f"ner_tags"]] + +>>> def compute_metrics(p): +... predictions, labels = p +... predictions = np.argmax(predictions, axis=2) + +... true_predictions = [ +... [label_list[p] for (p, l) in zip(prediction, label) if l != -100] +... for prediction, label in zip(predictions, labels) +... ] +... true_labels = [ +... [label_list[l] for (p, l) in zip(prediction, label) if l != -100] +... for prediction, label in zip(predictions, labels) +... ] + +... results = seqeval.compute(predictions=true_predictions, references=true_labels) +... return { +... "precision": results["overall_precision"], +... "recall": results["overall_recall"], +... "f1": results["overall_f1"], +... "accuracy": results["overall_accuracy"], +... } +``` + +دالة `compute_metrics` جاهزة للاستخدام، وستحتاج إليها عند إعداد التدريب. + +## التدريب(Train) + +قبل تدريب النموذج، جهّز خريطة تربط بين المعرّفات المتوقعة وتسمياتها باستخدام `id2label` و `label2id`: + +```py +>>> id2label = { +... 0: "O", +... 1: "B-corporation", +... 2: "I-corporation", +... 3: "B-creative-work", +... 4: "I-creative-work", +... 5: "B-group", +... 6: "I-group", +... 7: "B-location", +... 8: "I-location", +... 9: "B-person", +... 10: "I-person", +... 11: "B-product", +... 12: "I-product", +... } +>>> label2id = { +... "O": 0, +... "B-corporation": 1, +... "I-corporation": 2, +... "B-creative-work": 3, +... "I-creative-work": 4, +... "B-group": 5, +... "I-group": 6, +... "B-location": 7, +... "I-location": 8, +... "B-person": 9, +... "I-person": 10, +... "B-product": 11, +... "I-product": 12, +... 
} +``` + + + + + +إذا لم تكن على دراية بتعديل نموذج باستخدام [`Trainer`], ألق نظرة على الدليل التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت مستعد الآن لبدء تدريب نموذجك! قم بتحميل DistilBERT مع [`AutoModelForTokenClassification`] إلى جانب عدد التصنيفات المتوقعة، وخريطة التسميات: + +```py +>>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer + +>>> model = AutoModelForTokenClassification.from_pretrained( +... "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id +... ) +``` + +في هذه المرحلة، هناك ثلاث خطوات فقط متبقية: + +1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يحدد مكان حفظ نموذجك. ستقوم بدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب أن تكون مسجلاً الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم درجات seqeval وحفظ تسخة التدريب. +2. قم بتمرير معاملات التدريب إلى [`Trainer`] إلى جانب النموذج، ومجموعة البيانات، والمُجزِّئ اللغوي، و`data collator`، ودالة `compute_metrics`. +3.استدعِ [`~Trainer.train`] لتدريب نموذجك. + +```py +>>> training_args = TrainingArguments( +... output_dir="my_awesome_wnut_model", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... num_train_epochs=2, +... weight_decay=0.01, +... eval_strategy="epoch", +... save_strategy="epoch", +... load_best_model_at_end=True, +... push_to_hub=True, +... ) + +>>> trainer = Trainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_wnut["train"], +... eval_dataset=tokenized_wnut["test"], +... processing_class=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، شارك نموذجك على Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن على دراية بتعديل نموذج باستخدام Keras، ألق نظرة على الدليل التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! + + +للتعديل على نموذج في TensorFlow، ابدأ بإعداد دالة محسن، وجدول معدل التعلم، وبعض معلمات التدريب: + +```py +>>> from transformers import create_optimizer + +>>> batch_size = 16 +>>> num_train_epochs = 3 +>>> num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs +>>> optimizer, lr_schedule = create_optimizer( +... init_lr=2e-5, +... num_train_steps=num_train_steps, +... weight_decay_rate=0.01, +... num_warmup_steps=0, +... ) +``` + +ثم يمكنك تحميل DistilBERT مع [`TFAutoModelForTokenClassification`] إلى جانب عدد التسميات المتوقعة، وتخطيطات التسميات: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained( +... "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id +... ) +``` + +قم بتحويل مجموعات بياناتك إلى تنسيق `tf.data.Dataset` مع [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_wnut["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_validation_set = model.prepare_tf_dataset( +... tokenized_wnut["validation"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +هيّئ النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). 
لاحظ أن نماذج Transformers تتضمن دالة خسارة افتراضية مرتبطة بالمهمة، لذلك لا تحتاج إلى تحديد واحدة إلا إذا كنت ترغب في ذلك: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +آخر أمرين يجب إعدادهما قبل بدء التدريب هو حساب درجات seqeval من التنبؤات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم ذلك باستخدام [Keras callbacks](../main_classes/keras_callbacks). + +مرر دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +``` + +حدد مكان دفع نموذجك والمحلل اللغوي في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_wnut_model", +... tokenizer=tokenizer, +... ) +``` + +ثم جمّع callbacks الخاصة بك معًا: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +أخيرًا، أنت جاهز الآن لبدء تدريب نموذجك! قم باستدعاء [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع بيانات التدريب والتحقق، وعدد الحقبات، وcallbacks لتعديل النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +للحصول على مثال أكثر تفصيلاً حول كيفية تعديل نموذج لتصنيف الرموز، ألق نظرة على الدفتر المقابل +[دفتر PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb) +أو [دفتر TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb). + + + +## الاستدلال(Inference) + +رائع، الآن بعد أن قمت بتعديل نموذج، يمكنك استخدامه للاستدلال! + +احصل على بعض النصوص التي تريد تشغيل الاستدلال عليها: + +```py +>>> text = "The Golden State Warriors are an American professional basketball team based in San Francisco." +``` + +أبسط طريقة لتجربة نموذجك المُدرب مسبقًا للاستدلال هي استخدامه في [`pipeline`]. قم بتنفيذ `pipeline` لتصنيف الكيانات المسماة مع نموذجك، ومرر نصك إليه: + +```py +>>> from transformers import pipeline + +>>> classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model") +>>> classifier(text) +[{'entity': 'B-location', + 'score': 0.42658573, + 'index': 2, + 'word': 'golden', + 'start': 4, + 'end': 10}, + {'entity': 'I-location', + 'score': 0.35856336, + 'index': 3, + 'word': 'state', + 'start': 11, + 'end': 16}, + {'entity': 'B-group', + 'score': 0.3064001, + 'index': 4, + 'word': 'warriors', + 'start': 17, + 'end': 25}, + {'entity': 'B-location', + 'score': 0.65523505, + 'index': 13, + 'word': 'san', + 'start': 80, + 'end': 83}, + {'entity': 'B-location', + 'score': 0.4668663, + 'index': 14, + 'word': 'francisco', + 'start': 84, + 'end': 93}] +``` + +يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت: + + + +قسّم النص إلى رموز وأرجع المُوتّرات بلغة PyTorch: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> inputs = tokenizer(text, return_tensors="pt") +``` + +مرر مدخلاتك إلى النموذج واحصل على `logits`: + +```py +>>> from transformers import AutoModelForTokenClassification + +>>> model = AutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> with torch.no_grad(): +... 
logits = model(**inputs).logits +``` + +استخرج الفئة ذات الاحتمالية الأعلى، واستخدم جدول `id2label` الخاصة بالنموذج لتحويلها إلى تسمية نصية: + +```py +>>> predictions = torch.argmax(logits, dim=2) +>>> predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]] +>>> predicted_token_class +['O', + 'O', + 'B-location', + 'I-location', + 'B-group', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'B-location', + 'B-location', + 'O', + 'O'] +``` + + +قسّم النص إلى رموز وأرجع المُوتّرات ب TensorFlow: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> inputs = tokenizer(text, return_tensors="tf") +``` + +مرر مدخلاتك إلى النموذج واحصل على `logits`: + +```py +>>> from transformers import TFAutoModelForTokenClassification + +>>> model = TFAutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") +>>> logits = model(**inputs).logits +``` + +استخرج الفئة ذات الاحتمالية الأعلى، واستخدم جدول `id2label` الخاصة بالنموذج لتحويلها إلى تسمية نصية: + +```py +>>> predicted_token_class_ids = tf.math.argmax(logits, axis=-1) +>>> predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()] +>>> predicted_token_class +['O', + 'O', + 'B-location', + 'I-location', + 'B-group', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'B-location', + 'B-location', + 'O', + 'O'] +``` + + diff --git a/docs/source/ar/tasks/translation.md b/docs/source/ar/tasks/translation.md new file mode 100644 index 00000000000000..6245b903c22d63 --- /dev/null +++ b/docs/source/ar/tasks/translation.md @@ -0,0 +1,407 @@ + + +# الترجمة(Translation) + +[[open-in-colab]] + + + +الترجمة هي عملية تحويل سلسلة نصية من لغة إلى أخرى. وهي إحدى المهام التي يمكن صياغتها كمسألة تسلسل إلى تسلسل، وهو إطار عمل قوي لإنتاج مخرجات من مدخلات، مثل الترجمة أو التلخيص. تُستخدم أنظمة الترجمة عادةً للترجمة بين نصوص لغات مختلفة، ويمكن استخدامها أيضًا لترجمة الكلام أو لمهام تجمع بين النصوص والكلام، مثل تحويل النص إلى كلام أو تحويل الكلام إلى نص. + +سيوضح لك هذا الدليل كيفية: + +1. ضبط دقيق لنموذج [T5](https://huggingface.co/google-t5/t5-small) على المجموعة الفرعية الإنجليزية-الفرنسية من مجموعة بيانات [OPUS Books](https://huggingface.co/datasets/opus_books) لترجمة النص الإنجليزي إلى الفرنسية. +2. استخدام النموذج المضبوط بدقة للاستدلال. + + + +لمشاهدة جميع البنى والنسخ المتوافقة مع هذه المهمة، نوصي بالتحقق من [صفحة المهمة](https://huggingface.co/tasks/translation). + + + +قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية: + +```bash +pip install transformers datasets evaluate sacrebleu +``` + +نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. 
عند الطلب، أدخل الرمز المميز الخاص بك لتسجيل الدخول: + +```py +>>> from huggingface_hub import notebook_login + +>>> notebook_login() +``` + +## تحميل مجموعة بيانات OPUS Books + +ابدأ بتحميل المجموعة الفرعية الإنجليزية-الفرنسية من مجموعة بيانات [OPUS Books](https://huggingface.co/datasets/opus_books) من مكتبة 🤗 Datasets: + +```py +>>> from datasets import load_dataset + +>>> books = load_dataset("opus_books", "en-fr") +``` + +قسّم مجموعة البيانات إلى مجموعة تدريب ومجموعة اختبار باستخدام طريقة [`~datasets.Dataset.train_test_split`]: + +```py +>>> books = books["train"].train_test_split(test_size=0.2) +``` + +ثم ألقِ نظرة على مثال: + +```py +>>> books["train"][0] +{'id': '90560', + 'translation': {'en': 'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.', + 'fr': 'Mais ce plateau élevé ne mesurait que quelques toises, et bientôt nous fûmes rentrés dans notre élément.'}} +``` + +`translation`: ترجمة إنجليزية وفرنسية للنص. + +## المعالجة المسبقة(Preprocess) + + + +الخطوة التالية هي تحميل مُجزئ T5 لمعالجة أزواج اللغة الإنجليزية-الفرنسية: + +```py +>>> from transformers import AutoTokenizer + +>>> checkpoint = "google-t5/t5-small" +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) +``` + +يجب أن تقوم دالة المعالجة المسبقة التي تُريد إنشاءها بما يلي: + +1. إضافة بادئة إلى المُدخل بمُوجه حتى يعرف T5 أن هذه مهمة ترجمة. تتطلب بعض النماذج القادرة على أداء مهام متعددة توجيهًا لمهام مُحددة. +2. تعيين اللغة الهدف (الفرنسية) في معامل `text_target` لضمان معالجة المُجزئ للنص بشكل صحيح. إذا لم تُعيّن `text_target`، فسيُعالج المُجزئ النص على أنه إنجليزي. +3. اقتطاع التسلسلات بحيث لا يزيد طولها عن الحد الأقصى الذي يحدده معامل `max_length`. + +```py +>>> source_lang = "en" +>>> target_lang = "fr" +>>> prefix = "translate English to French: " + +>>> def preprocess_function(examples): +... inputs = [prefix + example[source_lang] for example in examples["translation"]] +... targets = [example[target_lang] for example in examples["translation"]] +... model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True) +... return model_inputs +``` + +لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] من 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد: + +```py +>>> tokenized_books = books.map(preprocess_function, batched=True) +``` + +الآن أنشئ دفعة من الأمثلة باستخدام [`DataCollatorForSeq2Seq`]. من الأكثر كفاءة *الحشو الديناميكي* للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول. + + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint) +``` + + + +```py +>>> from transformers import DataCollatorForSeq2Seq + +>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf") +``` + + + +## التقييم (Evaluate) + +غالباً ما يكون تضمين مقياس أثناء التدريب مفيداً لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). 
لهذه المهمة، حمّل مقياس [SacreBLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu) (راجع [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) لـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل وحساب مقياس): + +```py +>>> import evaluate + +>>> metric = evaluate.load("sacrebleu") +``` + +ثم أنشئ دالة تُمرر تنبؤاتك وتسمياتك إلى [`~evaluate.EvaluationModule.compute`] لحساب درجة SacreBLEU: + +```py +>>> import numpy as np + +>>> def postprocess_text(preds, labels): +... preds = [pred.strip() for pred in preds] +... labels = [[label.strip()] for label in labels] + +... return preds, labels + +>>> def compute_metrics(eval_preds): +... preds, labels = eval_preds +... if isinstance(preds, tuple): +... preds = preds[0] +... decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) + +... labels = np.where(labels != -100, labels, tokenizer.pad_token_id) +... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + +... decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) + +... result = metric.compute(predictions=decoded_preds, references=decoded_labels) +... result = {"bleu": result["score"]} + +... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] +... result["gen_len"] = np.mean(prediction_lens) +... result = {k: round(v, 4) for k, v in result.items()} +... return result +``` + +دالة `compute_metrics` الخاصة بك جاهزة الآن، وسوف تعود إليها عند إعداد التدريب. + +## التدريب (Train) + + + + + + +إذا لم تكن معتادًا على ضبط دقيق نموذج باستخدام [`Trainer`], فألقِ نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)! + + + +أنت جاهز لبدء تدريب نموذجك الآن! حمّل T5 باستخدام [`AutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer + +>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +في هذه المرحلة، تبقى ثلاث خطوات فقط: + +1. حدد مُعاملات للتدريب في [`Seq2SeqTrainingArguments`]. المُعامل الوحيدة المطلوبة هي `output_dir` التي تحدد مكان حفظ النموذج الخاص بك. ستقوم بدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم مقياس SacreBLEU وحفظ نقطة تدقيق التدريب. +2. مرر مُعاملات التدريب إلى [`Seq2SeqTrainer`] جنبًا إلى جنب مع النموذج ومجموعة البيانات والمعالج اللغوي وجامع البيانات ووظيفة `compute_metrics`. +3. نفّذ [`~Trainer.train`] لضبط نموذجك. + +```py +>>> training_args = Seq2SeqTrainingArguments( +... output_dir="my_awesome_opus_books_model", +... eval_strategy="epoch", +... learning_rate=2e-5, +... per_device_train_batch_size=16, +... per_device_eval_batch_size=16, +... weight_decay=0.01, +... save_total_limit=3, +... num_train_epochs=2, +... predict_with_generate=True, +... fp16=True, #change to bf16=True for XPU +... push_to_hub=True, +... ) + +>>> trainer = Seq2SeqTrainer( +... model=model, +... args=training_args, +... train_dataset=tokenized_books["train"], +... eval_dataset=tokenized_books["test"], +... processing_class=tokenizer, +... data_collator=data_collator, +... compute_metrics=compute_metrics, +... ) + +>>> trainer.train() +``` + +بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك: + +```py +>>> trainer.push_to_hub() +``` + + + + +إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)! 
+ + +لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل تعلم وبعض المعلمات الفائقة للتدريب: + +```py +>>> from transformers import AdamWeightDecay + +>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) +``` + +ثم يمكنك تحميل T5 باستخدام [`TFAutoModelForSeq2SeqLM`]: + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) +``` + +حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]: + +```py +>>> tf_train_set = model.prepare_tf_dataset( +... tokenized_books["train"], +... shuffle=True, +... batch_size=16, +... collate_fn=data_collator, +... ) + +>>> tf_test_set = model.prepare_tf_dataset( +... tokenized_books["test"], +... shuffle=False, +... batch_size=16, +... collate_fn=data_collator, +... ) +``` + +قم بتكوين النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers تحتوي على دالة خسارة ذات صلة بالمهمة بشكل افتراضي، لذلك لا تحتاج إلى تحديد واحدة إلا إذا كنت ترغب في ذلك: + +```py +>>> import tensorflow as tf + +>>> model.compile(optimizer=optimizer) # No loss argument! +``` + +آخر شيئين يجب إعدادهما قبل بدء التدريب هما حساب مقياس SacreBLEU من التوقعات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم كلاهما باستخدام [استدعاءات Keras](../main_classes/keras_callbacks). + +مرر دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]: + +```py +>>> from transformers.keras_callbacks import KerasMetricCallback + +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set) +``` + +حدد مكان دفع نموذجك ومعالجك اللغوي في [`~transformers.PushToHubCallback`]: + +```py +>>> from transformers.keras_callbacks import PushToHubCallback + +>>> push_to_hub_callback = PushToHubCallback( +... output_dir="my_awesome_opus_books_model", +... tokenizer=tokenizer, +... ) +``` + +ثم اجمع استدعاءاتك معًا: + +```py +>>> callbacks = [metric_callback, push_to_hub_callback] +``` + +أخيرًا، أنت جاهز لبدء تدريب نموذجك! اتصل بـ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة وعدد الحقب واستدعاءاتك لضبط النموذج: + +```py +>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks) +``` + +بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه! + + + + + +للحصول على مثال أكثر تعمقًا لكيفية ضبط نموذج للترجمة، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb) المقابل +أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb). + + + +## الاستدلال (Inference) + +رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال! + +أحضر بعض النصوص التي ترغب في ترجمتها إلى لغة أخرى. بالنسبة لـ T5، تحتاج إلى إضافة بادئة إلى مدخلاتك اعتمادًا على المهمة التي تعمل عليها. للترجمة من الإنجليزية إلى الفرنسية، يجب عليك إضافة بادئة إلى مدخلاتك كما هو موضح أدناه: + +```py +>>> text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria." +``` + +أبسط طريقة لتجربة نموذجك المضبوط للاستدلال هي استخدامه في [`pipeline`]. قم بإنشاء مثيل لـ `pipeline` للترجمة باستخدام نموذجك، ومرر النص الخاص بك إليه: + +```py +>>> from transformers import pipeline + +# تغيير `xx` إلى لغة الإدخال و `yy` إلى لغة المخرجات المطلوبة. 
+# أمثلة: "en" للغة الإنجليزية، "fr" للغة الفرنسية، "de" للغة الألمانية، "es" للغة الإسبانية، "zh" للغة الصينية، إلخ؛ translation_en_to_fr تترجم من الإنجليزية إلى الفرنسية +# يمكنك عرض جميع قوائم اللغات هنا - https://huggingface.co/languages +>>> translator = pipeline("translation_xx_to_yy", model="username/my_awesome_opus_books_model") +>>> translator(text) +[{'translation_text': 'Legumes partagent des ressources avec des bactéries azotantes.'}] +``` + +يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت: + + + +قم بتحويل النص إلى رموز وإرجاع `input_ids` كموترات PyTorch: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_opus_books_model") +>>> inputs = tokenizer(text, return_tensors="pt").input_ids +``` + +استخدم الدالة [`~generation.GenerationMixin.generate`] لإنشاء الترجمة. لمزيد من التفاصيل حول استراتيجيات توليد النصوص المختلفة والمعلمات للتحكم في التوليد، تحقق من واجهة برمجة تطبيقات [توليد النصوص](../main_classes/text_generation). + +```py +>>> from transformers import AutoModelForSeq2SeqLM + +>>> model = AutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_opus_books_model") +>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) +``` + +فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'Les lignées partagent des ressources avec des bactéries enfixant l'azote.' +``` + + +قم بتحويل النص إلى رموز وإرجاع `input_ids` كموترات TensorFlow: + +```py +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_opus_books_model") +>>> inputs = tokenizer(text, return_tensors="tf").input_ids +``` + +استخدم طريقة [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] لإنشاء الترجمة. لمزيد من التفاصيل حول استراتيجيات توليد النصوص المختلفة والمعلمات للتحكم في التوليد، تحقق من واجهة برمجة تطبيقات [توليد النصوص](../main_classes/text_generation). + +```py +>>> from transformers import TFAutoModelForSeq2SeqLM + +>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_opus_books_model") +>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) +``` + +فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص: + +```py +>>> tokenizer.decode(outputs[0], skip_special_tokens=True) +'Les lugumes partagent les ressources avec des bactéries fixatrices d'azote.' 
+``` + + \ No newline at end of file diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a076f704b8ede2..40780d24d51c49 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -386,6 +386,8 @@ title: DeBERTa-v2 - local: model_doc/dialogpt title: DialoGPT + - local: model_doc/diffllama + title: DiffLlama - local: model_doc/distilbert title: DistilBERT - local: model_doc/dpr @@ -450,6 +452,8 @@ title: Granite - local: model_doc/granitemoe title: GraniteMoe + - local: model_doc/helium + title: Helium - local: model_doc/herbert title: HerBERT - local: model_doc/ibert @@ -504,6 +508,8 @@ title: MobileBERT - local: model_doc/modernbert title: ModernBert + - local: model_doc/moonshine + title: moonshine - local: model_doc/mpnet title: MPNet - local: model_doc/mpt @@ -719,6 +725,8 @@ title: Swin2SR - local: model_doc/table-transformer title: Table Transformer + - local: model_doc/textnet + title: TextNet - local: model_doc/timm_wrapper title: Timm Wrapper - local: model_doc/upernet @@ -737,6 +745,8 @@ title: ViTMatte - local: model_doc/vit_msn title: ViTMSN + - local: model_doc/vitpose + title: ViTPose - local: model_doc/yolos title: YOLOS - local: model_doc/zoedepth @@ -754,8 +764,6 @@ title: dac - local: model_doc/encodec title: EnCodec - - local: model_doc/hiera - title: Hiera - local: model_doc/hubert title: Hubert - local: model_doc/mctct @@ -854,6 +862,8 @@ title: DePlot - local: model_doc/donut title: Donut + - local: model_doc/emu3 + title: Emu3 - local: model_doc/flava title: FLAVA - local: model_doc/git diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md index 0108cb48e95cee..c44420543cf89f 100644 --- a/docs/source/en/chat_templating.md +++ b/docs/source/en/chat_templating.md @@ -23,7 +23,7 @@ of text (as is the case with a standard language model), the model instead conti of one or more **messages**, each of which includes a **role**, like "user" or "assistant", as well as message text. Much like tokenization, different models expect very different input formats for chat. This is the reason we added -**chat templates** as a feature. Chat templates are part of the tokenizer. They specify how to convert conversations, +**chat templates** as a feature. Chat templates are part of the tokenizer for text-only LLMs or processor for multimodal LLMs. They specify how to convert conversations, represented as lists of messages, into a single tokenizable string in the format that the model expects. Let's make this concrete with a quick example using the `mistralai/Mistral-7B-Instruct-v0.1` model: @@ -66,10 +66,12 @@ for you, allowing you to write universal code that works for any model. ## How do I use chat templates? As you can see in the example above, chat templates are easy to use. Simply build a list of messages, with `role` -and `content` keys, and then pass it to the [`~PreTrainedTokenizer.apply_chat_template`] method. Once you do that, +and `content` keys, and then pass it to the [`~PreTrainedTokenizer.apply_chat_template`] or [`~ProcessorMixin.apply_chat_template`] method +depending on what type of model you are using. Once you do that, you'll get output that's ready to go! When using chat templates as input for model generation, it's also a good idea to use `add_generation_prompt=True` to add a [generation prompt](#what-are-generation-prompts). 
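
As a quick illustration of that flag, the sketch below renders the same conversation with and without a generation prompt; the Zephyr checkpoint is reused here purely as an example, and any model with a chat template should behave analogously.

```python
from transformers import AutoTokenizer

# Example checkpoint only; any chat model with a template works the same way.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

messages = [{"role": "user", "content": "How many helicopters can a human eat in one sitting?"}]

# Without a generation prompt, the formatted string ends right after the user turn...
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False))

# ...with one, the template appends the tokens that open an assistant turn, which is what you
# want when the result is passed on to `model.generate()`.
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```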
+## Usage with text-only LLMs Here's an example of preparing input for `model.generate()`, using `Zephyr` again: ```python @@ -116,6 +118,44 @@ How many helicopters can a human eat in one sitting? Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all. ``` +## Usage with multimodal LLMs + +For multimodal LLMs such as [LLaVA](https://huggingface.co/llava-hf) the prompts can be formatted in a similar way. The only difference is you need to pass input images/videos as well along with the text. Each `"content"` +has to be a list containing either a text or an image/video. + +Here's an example of preparing input for using `LLaVA` model: + +```python +from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration + +model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" +model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id) # You may want to use bfloat16 and/or move to GPU here +processor = AutoProcessor.from_pretrained(model_id) + +messages = [ + { + "role": "system", + "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}], + }, + { + "role": "user", + "content": [ + {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"}, + {"type": "text", "text": "What are these?"}, + ], + }, +] + +processed_chat = processor.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt") +print(processor.batch_decode(processed_chat["input_ids"][:, :30])) +``` +This yields a string in LLaVAs expected input format with many `` tokens at the end. +The `` tokens are placeholders and each one will be replaced by image embeddings when the mode is run in the forward call. The `processed_chat` can be further passed into [`~GenerationMixin.generate`] to generate text. +```text +'<|im_start|>system +You are a friendly chatbot who always responds in the style of a pirate<|im_end|><|im_start|>user ' +``` + Arr, 'twas easy after all! ## Is there an automated pipeline for chat? diff --git a/docs/source/en/deepspeed.md b/docs/source/en/deepspeed.md index 7f7995c4664133..ad3d4240f85619 100644 --- a/docs/source/en/deepspeed.md +++ b/docs/source/en/deepspeed.md @@ -586,6 +586,20 @@ You can choose the communication data type by setting the `communication_data_ty } ``` +### Universal Checkpointing + +[Universal Checkpointing](https://www.deepspeed.ai/tutorials/universal-checkpointing) is an efficient and flexible feature for saving and loading model checkpoints. It enables seamless model training continuation and fine-tuning across different model architectures, parallelism techniques, and training configurations. + +Resume training with a universal checkpoint by setting [load_universal](https://www.deepspeed.ai/docs/config-json/#checkpoint-options) to `true` in the config file. + +```yaml +{ + "checkpoint": { + "load_universal": true + } +} +``` + ## Deployment DeepSpeed can be deployed by different launchers such as [torchrun](https://pytorch.org/docs/stable/elastic/run.html), the `deepspeed` launcher, or [Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch#using-accelerate-launch). 
To deploy, add `--deepspeed ds_config.json` to the [`Trainer`] command line. It’s recommended to use DeepSpeed’s [`add_config_arguments`](https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing) utility to add any necessary command line arguments to your code. diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 47032a2a292b1b..4c4c19a3d6628d 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -96,6 +96,12 @@ distribution over the entire vocabulary with various strategy-specific adjustmen the decoding strategies that support multiple sequence candidates, e.g. variations of beam search and sampling. Decoding strategies like greedy search and contrastive search return a single output sequence. +It is also possible to extend `generate()` with external libraries or handcrafted code. The `logits_processor` argument +allows you to pass custom [`LogitsProcessor`] instances, allowing you to manipulate the next token probability +distributions. Likewise, the `stopping_criteria` argument lets you set custom [`StoppingCriteria`] to stop text generation. +The [`logits-processor-zoo`](https://github.com/NVIDIA/logits-processor-zoo) library contains examples of external +`generate()`-compatible extensions. + ## Save a custom decoding strategy with your model If you would like to share your fine-tuned model with a specific generation configuration, you can: @@ -435,6 +441,28 @@ To enable assisted decoding, set the `assistant_model` argument with a model. ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] ``` + + +If you're using a `pipeline` object, all you need to do is to pass the assistant checkpoint under `assistant_model` + +```python +>>> from transformers import pipeline +>>> import torch + +>>> pipe = pipeline( +... "text-generation", +... model="meta-llama/Llama-3.1-8B", +... assistant_model="meta-llama/Llama-3.2-1B", # This extra line is all that's needed, also works with UAD +... torch_dtype=torch.bfloat16 +>>> ) +>>> pipe_output = pipe("Once upon a time, ", max_new_tokens=50, do_sample=False) +>>> pipe_output[0]["generated_text"] +'Once upon a time, 3D printing was a niche technology that was only' +``` + + + + When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness, just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency. diff --git a/docs/source/en/index.md b/docs/source/en/index.md index dcecfc872d61d0..1c90fd71b3d298 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -125,6 +125,7 @@ Flax), PyTorch, and/or TensorFlow. | [DETA](model_doc/deta) | ✅ | ❌ | ❌ | | [DETR](model_doc/detr) | ✅ | ❌ | ❌ | | [DialoGPT](model_doc/dialogpt) | ✅ | ✅ | ✅ | +| [DiffLlama](model_doc/diffllama) | ✅ | ❌ | ❌ | | [DiNAT](model_doc/dinat) | ✅ | ❌ | ❌ | | [DINOv2](model_doc/dinov2) | ✅ | ❌ | ✅ | | [DINOv2 with Registers](model_doc/dinov2_with_registers) | ✅ | ❌ | ❌ | @@ -136,6 +137,7 @@ Flax), PyTorch, and/or TensorFlow. | [EfficientFormer](model_doc/efficientformer) | ✅ | ✅ | ❌ | | [EfficientNet](model_doc/efficientnet) | ✅ | ❌ | ❌ | | [ELECTRA](model_doc/electra) | ✅ | ✅ | ✅ | +| [Emu3](model_doc/emu3) | ✅ | ❌ | ❌ | | [EnCodec](model_doc/encodec) | ✅ | ❌ | ❌ | | [Encoder decoder](model_doc/encoder-decoder) | ✅ | ✅ | ✅ | | [ERNIE](model_doc/ernie) | ✅ | ❌ | ❌ | @@ -171,6 +173,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ | | [Grounding DINO](model_doc/grounding-dino) | ✅ | ❌ | ❌ | | [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ | +| [Helium](model_doc/helium) | ✅ | ❌ | ❌ | | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | | [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | @@ -234,6 +237,7 @@ Flax), PyTorch, and/or TensorFlow. | [MobileViT](model_doc/mobilevit) | ✅ | ✅ | ❌ | | [MobileViTV2](model_doc/mobilevitv2) | ✅ | ❌ | ❌ | | [ModernBERT](model_doc/modernbert) | ✅ | ❌ | ❌ | +| [Moonshine](model_doc/moonshine) | ✅ | ❌ | ❌ | | [Moshi](model_doc/moshi) | ✅ | ❌ | ❌ | | [MPNet](model_doc/mpnet) | ✅ | ✅ | ❌ | | [MPT](model_doc/mpt) | ✅ | ❌ | ❌ | @@ -325,6 +329,7 @@ Flax), PyTorch, and/or TensorFlow. | [Table Transformer](model_doc/table-transformer) | ✅ | ❌ | ❌ | | [TAPAS](model_doc/tapas) | ✅ | ✅ | ❌ | | [TAPEX](model_doc/tapex) | ✅ | ✅ | ✅ | +| [TextNet](model_doc/textnet) | ✅ | ❌ | ❌ | | [Time Series Transformer](model_doc/time_series_transformer) | ✅ | ❌ | ❌ | | [TimeSformer](model_doc/timesformer) | ✅ | ❌ | ❌ | | [TimmWrapperModel](model_doc/timm_wrapper) | ✅ | ❌ | ❌ | @@ -354,6 +359,8 @@ Flax), PyTorch, and/or TensorFlow. | [ViTMAE](model_doc/vit_mae) | ✅ | ✅ | ❌ | | [ViTMatte](model_doc/vitmatte) | ✅ | ❌ | ❌ | | [ViTMSN](model_doc/vit_msn) | ✅ | ❌ | ❌ | +| [VitPose](model_doc/vitpose) | ✅ | ❌ | ❌ | +| [VitPoseBackbone](model_doc/vitpose_backbone) | ✅ | ❌ | ❌ | | [VITS](model_doc/vits) | ✅ | ❌ | ❌ | | [ViViT](model_doc/vivit) | ✅ | ❌ | ❌ | | [Wav2Vec2](model_doc/wav2vec2) | ✅ | ✅ | ✅ | diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index af7c97ef3508ae..ae1f2101d749b3 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -32,27 +32,18 @@ Install 🤗 Transformers for whichever deep learning library you're working wit You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). A virtual environment makes it easier to manage different projects, and avoid compatibility issues between dependencies. -Start by creating a virtual environment in your project directory: +Now you're ready to install 🤗 Transformers with the following command: ```bash -python -m venv .env +pip install transformers ``` -Activate the virtual environment. On Linux and MacOs: +For GPU acceleration, install the appropriate CUDA drivers for [PyTorch](https://pytorch.org/get-started/locally) and TensorFlow(https://www.tensorflow.org/install/pip). -```bash -source .env/bin/activate -``` -Activate Virtual environment on Windows +Run the command below to check if your system detects an NVIDIA GPU. ```bash -.env/Scripts/activate -``` - -Now you're ready to install 🤗 Transformers with the following command: - -```bash -pip install transformers +nvidia-smi ``` For CPU-support only, you can conveniently install 🤗 Transformers and a deep learning library in one line. For example, install 🤗 Transformers and PyTorch with: @@ -254,3 +245,36 @@ Once your file is downloaded and locally cached, specify it's local path to load See the [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream) section for more details on downloading files stored on the Hub. + +## Troubleshooting + +See below for some of the more common installation issues and how to resolve them. 
+ +### Unsupported Python version + +Ensure you are using Python 3.9 or later. Run the command below to check your Python version. + +``` +python --version +``` + +### Missing dependencies + +Install all required dependencies by running the following command. Ensure you’re in the project directory before executing the command. + +``` +pip install -r requirements.txt +``` + +### Windows-specific + +If you encounter issues on Windows, you may need to activate Developer Mode. Navigate to Windows Settings > For Developers > Developer Mode. + +Alternatively, create and activate a virtual environment as shown below. + +``` +python -m venv env +.\env\Scripts\activate +``` + + diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index 17ebb841de7a39..37406ea0bef298 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -156,9 +156,11 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_valu There are a few important things you must do to enable static kv-cache and `torch.compile` with the `StaticCache` method: 1. Initialize the [`StaticCache`] instance before using the model for inference. There you can configure parameters like the maximum batch size and sequence length. 2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache. -3. Set `enable_math=True` in the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more. +3. Use `SDPBackend.MATH` in the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more. ```py +from torch.nn.attention import SDPBackend, sdpa_kernel + batch_size, seq_length = inputs["input_ids"].shape with torch.no_grad(): past_key_values = StaticCache( @@ -179,7 +181,7 @@ with torch.no_grad(): decode_one_tokens = torch.compile(decode_one_tokens, mode="reduce-overhead", fullgraph=True) cache_position = torch.tensor([seq_length + 1], device=torch_device) for _ in range(1, NUM_TOKENS_TO_GENERATE): - with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True): + with sdpa_kernel(SDPBackend.MATH): next_token = decode_one_tokens(model, next_token.clone(), None, cache_position, past_key_values) generated_ids[:, cache_position] = next_token.int() cache_position += 1 @@ -453,10 +455,11 @@ Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and > [!TIP] > SDPA supports FlashAttention-2 as long as you have the latest PyTorch version installed. -Use the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to explicitly enable or disable any of the three attention algorithms. For example, set `enable_flash=True` to enable FlashAttention. +Use the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to explicitly enable or disable any of the four attention algorithms. For example, use `SDPBackend.FLASH_ATTENTION` to enable FlashAttention. 
```py import torch +from torch.nn.attention import SDPBackend, sdpa_kernel from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained( @@ -464,7 +467,7 @@ model = AutoModelForCausalLM.from_pretrained( torch_dtype=torch.bfloat16, ) -with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): +with sdpa_kernel(SDPBackend.FLASH_ATTENTION): outputs = model.generate(**inputs) ``` diff --git a/docs/source/en/llm_tutorial.md b/docs/source/en/llm_tutorial.md index 097d7bf1e9ca38..eaf59c741cdd28 100644 --- a/docs/source/en/llm_tutorial.md +++ b/docs/source/en/llm_tutorial.md @@ -265,8 +265,9 @@ While the autoregressive generation process is relatively straightforward, makin ### Related libraries -1. [`optimum`](https://github.com/huggingface/optimum), an extension of 🤗 Transformers that optimizes for specific hardware devices. +1. [`optimum`](https://github.com/huggingface/optimum), an extension of 🤗 Transformers that optimizes for specific hardware devices; 2. [`outlines`](https://github.com/outlines-dev/outlines), a library where you can constrain text generation (e.g. to generate JSON files); -3. [`SynCode`](https://github.com/uiuc-focal-lab/syncode), a library for context-free grammar guided generation. (e.g. JSON, SQL, Python) +3. [`SynCode`](https://github.com/uiuc-focal-lab/syncode), a library for context-free grammar guided generation (e.g. JSON, SQL, Python); 4. [`text-generation-inference`](https://github.com/huggingface/text-generation-inference), a production-ready server for LLMs; 5. [`text-generation-webui`](https://github.com/oobabooga/text-generation-webui), a UI for text generation; +6. [`logits-processor-zoo`](https://github.com/NVIDIA/logits-processor-zoo), containing additional options to control text generation with 🤗 Transformers. See our related [blog post](https://huggingface.co/blog/logits-processor-zoo). diff --git a/docs/source/en/model_doc/diffllama.md b/docs/source/en/model_doc/diffllama.md new file mode 100644 index 00000000000000..80afcfe433e9dd --- /dev/null +++ b/docs/source/en/model_doc/diffllama.md @@ -0,0 +1,59 @@ + + +# DiffLlama + +## Overview + +The DiffLlama model was proposed in [Differential Transformer](https://arxiv.org/abs/2410.05258) by Kazuma Matsumoto and . +This model is combine Llama model and Differential Transformer's Attention. + +The abstract from the paper is the following: + +*Transformer tends to overallocate attention to irrelevant context. In this work, we introduce Diff Transformer, which amplifies attention to the relevant context while canceling noise. Specifically, the differential attention mechanism calculates attention scores as the difference between two separate softmax attention maps. The subtraction cancels noise, promoting the emergence of sparse attention patterns. Experimental results on language modeling show that Diff Transformer outperforms Transformer in various settings of scaling up model size and training tokens. More intriguingly, it offers notable advantages in practical applications, such as long-context modeling, key information retrieval, hallucination mitigation, in-context learning, and reduction of activation outliers. By being less distracted by irrelevant context, Diff Transformer can mitigate hallucination in question answering and text summarization. For in-context learning, Diff Transformer not only enhances accuracy but is also more robust to order permutation, which was considered as a chronic robustness issue. 
The results position Diff Transformer as a highly effective and promising architecture to advance large language models.*
+
+### Usage tips
+The hyperparameters of this model are the same as those of the Llama model.
+
+
+## DiffLlamaConfig
+
+[[autodoc]] DiffLlamaConfig
+
+## DiffLlamaModel
+
+[[autodoc]] DiffLlamaModel
+    - forward
+
+## DiffLlamaForCausalLM
+
+[[autodoc]] DiffLlamaForCausalLM
+    - forward
+
+## DiffLlamaForSequenceClassification
+
+[[autodoc]] DiffLlamaForSequenceClassification
+    - forward
+
+## DiffLlamaForQuestionAnswering
+
+[[autodoc]] DiffLlamaForQuestionAnswering
+    - forward
+
+## DiffLlamaForTokenClassification
+
+[[autodoc]] DiffLlamaForTokenClassification
+    - forward
diff --git a/docs/source/en/model_doc/emu3.md b/docs/source/en/model_doc/emu3.md
new file mode 100644
index 00000000000000..0b3220c073fb65
--- /dev/null
+++ b/docs/source/en/model_doc/emu3.md
@@ -0,0 +1,179 @@
+
+
+# Emu3
+
+## Overview
+
+The Emu3 model was proposed in [Emu3: Next-Token Prediction is All You Need](https://arxiv.org/abs/2409.18869) by Xinlong Wang, Xiaosong Zhang, Zhengxiong Luo, Quan Sun, Yufeng Cui, Jinsheng Wang, Fan Zhang, Yueze Wang, Zhen Li, Qiying Yu, Yingli Zhao, Yulong Ao, Xuebin Min, Tao Li, Boya Wu, Bo Zhao, Bowen Zhang, Liangdong Wang, Guang Liu, Zheqi He, Xi Yang, Jingjing Liu, Yonghua Lin, Tiejun Huang, Zhongyuan Wang.
+
+Emu3 is a multimodal LLM that uses vector quantization to tokenize images into discrete tokens. Discretized image tokens are later fused with text token ids for image and text generation. The model can additionally generate images by predicting image token ids.
+
+
+The abstract from the paper is the following:
+
+*While next-token prediction is considered a promising path towards artificial general intelligence, it has struggled to excel in multimodal tasks, which are still dominated by diffusion models (e.g., Stable Diffusion) and compositional approaches (e.g., CLIP combined with LLMs). In this paper, we introduce Emu3, a new suite of state-of-the-art multimodal models trained solely with next-token prediction. By tokenizing images, text, and videos into a discrete space, we train a single transformer from scratch on a mixture of multimodal sequences. Emu3 outperforms several well-established task-specific models in both generation and perception tasks, surpassing flagship models such as SDXL and LLaVA-1.6, while eliminating the need for diffusion or compositional architectures. Emu3 is also capable of generating high-fidelity video via predicting the next token in a video sequence. We simplify complex multimodal model designs by converging on a singular focus: tokens, unlocking great potential for scaling both during training and inference. Our results demonstrate that next-token prediction is a promising path towards building general multimodal intelligence beyond language. We open-source key techniques and models to support further research in this direction.*
+
+Tips:
+
+- We advise users to set `processor.tokenizer.padding_side = "left"` before batched generation as it leads to more accurate results.
+
+- Note that the model has been trained with a specific prompt format for chatting. Use `processor.apply_chat_template(my_conversation_dict)` to correctly format your prompts.
+
+- Emu3 has two different checkpoints for image generation and text generation; make sure to use the correct checkpoint when loading the model. To generate an image, it is advised to use `prefix_allowed_tokens_fn` so that the generated tokens are sampled only from possible image tokens.
See more below for usage examples. + +> [!TIP] +> Emu3 implementation in Transformers uses a special image token to indicate where to merge image embeddings. The special image token isn't new and uses one of the reserved tokens: `<|extra_0|>`. You have to add `` to your prompt in the place where the image should be embedded for correct generation. + + +This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). +The original code can be found [here](https://github.com/baaivision/Emu3). + + +## Usage example + +### Text generation inference + +Here's how to load the model and perform inference in half-precision (`torch.bfloat16`) to generate textual output from text or text and image inputs: + +```python +from transformers import Emu3Processor, Emu3ForConditionalGeneration +import torch +from PIL import Image +import requests + +processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Chat-hf") +model = Emu3ForConditionalGeneration.from_pretrained("Emu3-community/Emu3-Chat-hf", torch_dtype=torch.bfloat16, device_map="cuda") + +# prepare image and text prompt +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) +prompt = "What do you see in this image?" + +inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, dtype=torch.bfloat16) + +# autoregressively complete prompt +output = model.generate(**inputs, max_new_tokens=50) +print(processor.decode(output[0], skip_special_tokens=True)) +``` + +### Image generation inference + +Emu3 can also generate images from textual input. Here is how you can do it: + +```python +processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Gen-hf") +model = Emu3ForConditionalGeneration.from_pretrained("Emu3-community/Emu3-Gen-hf", torch_dtype="bfloat16", device_map="auto", attn_implementation="flash_attention_2") + + +inputs = processor( + text=["a portrait of young girl. masterpiece, film grained, best quality.", "a dog running under the rain"], + padding=True, + return_tensors="pt", + return_for_image_generation=True, +) +inputs = inputs.to(device="cuda:0", dtype=torch.bfloat16) + +neg_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry." 
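+# The negative prompt is tokenized separately below and later passed to `generate()` as
+# `negative_prompt_ids`/`negative_prompt_attention_mask`, steering classifier-free guidance away from these artifacts.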
+neg_inputs = processor(text=[neg_prompt] * 2, return_tensors="pt").to(device="cuda:0") + +image_sizes = inputs.pop("image_sizes") +HEIGHT, WIDTH = image_sizes[0] +VISUAL_TOKENS = model.vocabulary_mapping.image_tokens + +def prefix_allowed_tokens_fn(batch_id, input_ids): + height, width = HEIGHT, WIDTH + visual_tokens = VISUAL_TOKENS + image_wrapper_token_id = torch.tensor([processor.tokenizer.image_wrapper_token_id], device=model.device) + eoi_token_id = torch.tensor([processor.tokenizer.eoi_token_id], device=model.device) + eos_token_id = torch.tensor([processor.tokenizer.eos_token_id], device=model.device) + pad_token_id = torch.tensor([processor.tokenizer.pad_token_id], device=model.device) + eof_token_id = torch.tensor([processor.tokenizer.eof_token_id], device=model.device) + eol_token_id = processor.tokenizer.encode("<|extra_200|>", return_tensors="pt")[0] + + position = torch.nonzero(input_ids == image_wrapper_token_id, as_tuple=True)[0][0] + offset = input_ids.shape[0] - position + if offset % (width + 1) == 0: + return (eol_token_id, ) + elif offset == (width + 1) * height + 1: + return (eof_token_id, ) + elif offset == (width + 1) * height + 2: + return (eoi_token_id, ) + elif offset == (width + 1) * height + 3: + return (eos_token_id, ) + elif offset > (width + 1) * height + 3: + return (pad_token_id, ) + else: + return visual_tokens + + +out = model.generate( + **inputs, + max_new_tokens=50_000, # make sure to have enough tokens for one image + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + return_dict_in_generate=True, + negative_prompt_ids=neg_inputs.input_ids, # indicate for Classifier-Free Guidance + negative_prompt_attention_mask=neg_inputs.attention_mask, +) + +image = model.decode_image_tokens(out.sequences[:, inputs.input_ids.shape[1]: ], height=HEIGHT, width=WIDTH) +images = processor.postprocess(list(image.float()), return_tensors="PIL.Image.Image") # internally we convert to np but it's not supported in bf16 precision +for i, image in enumerate(images['pixel_values']): + image.save(f"result{i}.png") + +``` + + +## Emu3Config + +[[autodoc]] Emu3Config + +## Emu3VQVAEConfig + +[[autodoc]] Emu3VQVAEConfig + +## Emu3TextConfig + +[[autodoc]] Emu3TextConfig + +## Emu3Processor + +[[autodoc]] Emu3Processor + +## Emu3ImageProcessor + +[[autodoc]] Emu3ImageProcessor + - preprocess + +## Emu3VQVAE + +[[autodoc]] Emu3VQVAE + - forward + +## Emu3TextModel + +[[autodoc]] Emu3TextModel + - forward + +## Emu3ForCausalLM + +[[autodoc]] Emu3ForCausalLM + - forward + +## Emu3ForConditionalGeneration + +[[autodoc]] Emu3ForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/helium.md b/docs/source/en/model_doc/helium.md new file mode 100644 index 00000000000000..df5927544df960 --- /dev/null +++ b/docs/source/en/model_doc/helium.md @@ -0,0 +1,158 @@ + + +# Helium + + +## Overview + +Helium was proposed in [Announcing Helium-1 Preview](https://kyutai.org/2025/01/13/helium.html) by the Kyutai Team. + + +Helium-1 preview is a lightweight language model with 2B parameters, targeting edge and mobile devices. +It supports the following languages: English, French, German, Italian, Portuguese, Spanish. 
+ +- **Developed by:** Kyutai +- **Model type:** Large Language Model +- **Language(s) (NLP):** English, French, German, Italian, Portuguese, Spanish +- **License:** CC-BY 4.0 + + + + +## Evaluation + + + +#### Testing Data + + + +The model was evaluated on MMLU, TriviaQA, NaturalQuestions, ARC Easy & Challenge, Open Book QA, Common Sense QA, +Physical Interaction QA, Social Interaction QA, HellaSwag, WinoGrande, Multilingual Knowledge QA, FLORES 200. + +#### Metrics + + + +We report accuracy on MMLU, ARC, OBQA, CSQA, PIQA, SIQA, HellaSwag, WinoGrande. +We report exact match on TriviaQA, NQ and MKQA. +We report BLEU on FLORES. + +### English Results + +| Benchmark | Helium-1 Preview | HF SmolLM2 (1.7B) | Gemma-2 (2.6B) | Llama-3.2 (3B) | Qwen2.5 (1.5B) | +|--------------|--------|--------|--------|--------|--------| +| | | | | | | +| MMLU | 51.2 | 50.4 | 53.1 | 56.6 | 61.0 | +| NQ | 17.3 | 15.1 | 17.7 | 22.0 | 13.1 | +| TQA | 47.9 | 45.4 | 49.9 | 53.6 | 35.9 | +| ARC E | 80.9 | 81.8 | 81.1 | 84.6 | 89.7 | +| ARC C | 62.7 | 64.7 | 66.0 | 69.0 | 77.2 | +| OBQA | 63.8 | 61.4 | 64.6 | 68.4 | 73.8 | +| CSQA | 65.6 | 59.0 | 64.4 | 65.4 | 72.4 | +| PIQA | 77.4 | 77.7 | 79.8 | 78.9 | 76.0 | +| SIQA | 64.4 | 57.5 | 61.9 | 63.8 | 68.7 | +| HS | 69.7 | 73.2 | 74.7 | 76.9 | 67.5 | +| WG | 66.5 | 65.6 | 71.2 | 72.0 | 64.8 | +| | | | | | | +| Average | 60.7 | 59.3 | 62.2 | 64.7 | 63.6 | + +#### Multilingual Results + +| Language | Benchmark | Helium-1 Preview | HF SmolLM2 (1.7B) | Gemma-2 (2.6B) | Llama-3.2 (3B) | Qwen2.5 (1.5B) | +|-----|--------------|--------|--------|--------|--------|--------| +| | | | | | | | +|German| MMLU | 45.6 | 35.3 | 45.0 | 47.5 | 49.5 | +|| ARC C | 56.7 | 38.4 | 54.7 | 58.3 | 60.2 | +|| HS | 53.5 | 33.9 | 53.4 | 53.7 | 42.8 | +|| MKQA | 16.1 | 7.1 | 18.9 | 20.2 | 10.4 | +| | | | | | | | +|Spanish| MMLU | 46.5 | 38.9 | 46.2 | 49.6 | 52.8 | +|| ARC C | 58.3 | 43.2 | 58.8 | 60.0 | 68.1 | +|| HS | 58.6 | 40.8 | 60.5 | 61.1 | 51.4 | +|| MKQA | 16.0 | 7.9 | 18.5 | 20.6 | 10.6 | + + +## Technical Specifications + +### Model Architecture and Objective + +| Hyperparameter | Value | +|--------------|--------| +| Layers | 24 | +| Heads | 20 | +| Model dimension | 2560 | +| MLP dimension | 7040 | +| Context size | 4096 | +| Theta RoPE | 100,000 | + +Tips: + +- This model was contributed by [Laurent Mazare](https://huggingface.co/lmz) + + +## Usage tips + +`Helium` can be found on the [Huggingface Hub](https://huggingface.co/collections/kyutai/helium-1-preview) + +In the following, we demonstrate how to use `helium-1-preview` for the inference. + +```python +>>> from transformers import AutoModelForCausalLM, AutoTokenizer +>>> device = "cuda" # the device to load the model onto + +>>> model = AutoModelForCausalLM.from_pretrained("helium-1-preview", device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("helium-1-preview") + +>>> prompt = "Give me a short introduction to large language model." 
+ +>>> messages = [{"role": "user", "content": prompt}] + +>>> text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + +>>> model_inputs = tokenizer([text], return_tensors="pt").to(device) + +>>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True) + +>>> generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)] + +>>> response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +``` + +## HeliumConfig + +[[autodoc]] HeliumConfig + +## HeliumModel + +[[autodoc]] HeliumModel + - forward + +## HeliumForCausalLM + +[[autodoc]] HeliumForCausalLM + - forward + +## HeliumForSequenceClassification + +[[autodoc]] HeliumForSequenceClassification + - forward + +## HeliumForTokenClassification + +[[autodoc]] HeliumForTokenClassification + - forward diff --git a/docs/source/en/model_doc/modernbert.md b/docs/source/en/model_doc/modernbert.md index b641d7f3f58199..e90f34a903e49b 100644 --- a/docs/source/en/model_doc/modernbert.md +++ b/docs/source/en/model_doc/modernbert.md @@ -14,7 +14,7 @@ rendered properly in your Markdown viewer. --> -# ModernBert +# ModernBERT
@@ -27,7 +27,7 @@ rendered properly in your Markdown viewer. ## Overview -The ModernBert model was proposed in [Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference](https://arxiv.org/abs/2412.13663) by Benjamin Warner, Antoine Chaffin, Benjamin Clavié, Orion Weller, Oskar Hallström, Said Taghadouini, Alexis Galalgher, Raja Bisas, Faisal Ladhak, Tom Aarsen, Nathan Cooper, Grifin Adams, Jeremy Howard and Iacopo Poli. +The ModernBERT model was proposed in [Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference](https://arxiv.org/abs/2412.13663) by Benjamin Warner, Antoine Chaffin, Benjamin Clavié, Orion Weller, Oskar Hallström, Said Taghadouini, Alexis Galalgher, Raja Bisas, Faisal Ladhak, Tom Aarsen, Nathan Cooper, Grifin Adams, Jeremy Howard and Iacopo Poli. It is a refresh of the traditional encoder architecture, as used in previous models such as [BERT](https://huggingface.co/docs/transformers/en/model_doc/bert) and [RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/roberta). diff --git a/docs/source/en/model_doc/moonshine.md b/docs/source/en/model_doc/moonshine.md new file mode 100644 index 00000000000000..571e3febdb4fa8 --- /dev/null +++ b/docs/source/en/model_doc/moonshine.md @@ -0,0 +1,56 @@ + + +# Moonshine + +## Overview + +The Moonshine model was proposed in [Moonshine: Speech Recognition for Live Transcription and Voice Commands +](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden. + +The abstract from the paper is the following: + +*This paper introduces Moonshine, a family of speech recognition models optimized for live transcription and voice command processing. Moonshine is based on an encoder-decoder transformer architecture and employs Rotary Position Embedding (RoPE) instead of traditional absolute position embeddings. The model is trained on speech segments of various lengths, but without using zero-padding, leading to greater efficiency for the encoder during inference time. When benchmarked against OpenAI's Whisper tiny-en, Moonshine Tiny demonstrates a 5x reduction in compute requirements for transcribing a 10-second speech segment while incurring no increase in word error rates across standard evaluation datasets. These results highlight Moonshine's potential for real-time and resource-constrained applications.* + +Tips: + +- Moonshine improves upon Whisper's architecture: + 1. It uses SwiGLU activation instead of GELU in the decoder layers + 2. Most importantly, it replaces absolute position embeddings with Rotary Position Embeddings (RoPE). This allows Moonshine to handle audio inputs of any length, unlike Whisper which is restricted to fixed 30-second windows. + +This model was contributed by [Eustache Le Bihan (eustlb)](https://huggingface.co/eustlb). +The original code can be found [here](https://github.com/usefulsensors/moonshine). 
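
Since the tips above stay at the architecture level, a minimal transcription sketch follows as a sanity check of the encoder-decoder generation path; the checkpoint name and audio file are assumptions rather than part of the original page, and the generic ASR pipeline is used for brevity.

```python
from transformers import pipeline

# Checkpoint name is illustrative; substitute whichever Moonshine checkpoint you intend to use.
transcriber = pipeline("automatic-speech-recognition", model="UsefulSensors/moonshine-tiny")

# Any short audio clip works; for file inputs the pipeline decodes and resamples via ffmpeg.
print(transcriber("sample.wav")["text"])
```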
+ +## Resources + +- [Automatic speech recognition task guide](../tasks/asr) + +## MoonshineConfig + +[[autodoc]] MoonshineConfig + +## MoonshineModel + +[[autodoc]] MoonshineModel + - forward + - _mask_input_features + +## MoonshineForConditionalGeneration + +[[autodoc]] MoonshineForConditionalGeneration + - forward + - generate + diff --git a/docs/source/en/model_doc/musicgen_melody.md b/docs/source/en/model_doc/musicgen_melody.md index 4d92d861f0bb5f..7b67713c42b743 100644 --- a/docs/source/en/model_doc/musicgen_melody.md +++ b/docs/source/en/model_doc/musicgen_melody.md @@ -266,7 +266,6 @@ Tips: ## MusicgenMelodyFeatureExtractor [[autodoc]] MusicgenMelodyFeatureExtractor - - _extract_stem_indices ## MusicgenMelodyConfig diff --git a/docs/source/en/model_doc/qwen2_audio.md b/docs/source/en/model_doc/qwen2_audio.md index f399a7e7320c17..2ef947ce430d40 100644 --- a/docs/source/en/model_doc/qwen2_audio.md +++ b/docs/source/en/model_doc/qwen2_audio.md @@ -34,6 +34,37 @@ The abstract from the paper is the following: `Qwen2-Audio-7B` and `Qwen2-Audio-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen) +### Inference + +```python +from io import BytesIO +from urllib.request import urlopen +import librosa +from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration + +model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B", trust_remote_code=True, device_map="auto") +processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B", trust_remote_code=True) + +prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Generate the caption in English:" +url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3" +audio, sr = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate) +inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device) + +generate_ids = model.generate(**inputs, max_length=256) +generate_ids = generate_ids[:, inputs.input_ids.size(1):] + +response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + +# We can also omit the audio_bos and audio_eos tokens +prompt = "<|AUDIO|>Generate the caption in English:" +inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device) + +generate_ids = model.generate(**inputs, max_length=256) +generate_ids = generate_ids[:, inputs.input_ids.size(1):] + +response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] +``` + In the following, we demonstrate how to use `Qwen2-Audio-7B-Instruct` for the inference, supporting both voice chat and audio analysis modes. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose. ### Voice Chat Inference diff --git a/docs/source/en/model_doc/siglip.md b/docs/source/en/model_doc/siglip.md index 88e38cbb590edc..0c0977d10b580c 100644 --- a/docs/source/en/model_doc/siglip.md +++ b/docs/source/en/model_doc/siglip.md @@ -102,7 +102,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that: A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SigLIP. 
-- [Zero-shot image classification task guide](../tasks/zero_shot_image_classification_md) +- [Zero-shot image classification task guide](../tasks/zero_shot_image_classification) - Demo notebooks for SigLIP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SigLIP). 🌎 If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. diff --git a/docs/source/en/model_doc/textnet.md b/docs/source/en/model_doc/textnet.md new file mode 100644 index 00000000000000..d6b431e648f21b --- /dev/null +++ b/docs/source/en/model_doc/textnet.md @@ -0,0 +1,55 @@ + + +# TextNet + +## Overview + +The TextNet model was proposed in [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/abs/2111.02394) by Zhe Chen, Jiahao Wang, Wenhai Wang, Guo Chen, Enze Xie, Ping Luo, Tong Lu. TextNet is a vision backbone useful for text detection tasks. It is the result of neural architecture search (NAS) on backbones with reward function as text detection task (to provide powerful features for text detection). + + + + TextNet backbone as part of FAST. Taken from the original paper. + +This model was contributed by [Raghavan](https://huggingface.co/Raghavan), [jadechoghari](https://huggingface.co/jadechoghari) and [nielsr](https://huggingface.co/nielsr). + +## Usage tips + +TextNet is mainly used as a backbone network for the architecture search of text detection. Each stage of the backbone network is comprised of a stride-2 convolution and searchable blocks. +Specifically, we present a layer-level candidate set, defined as {conv3×3, conv1×3, conv3×1, identity}. As the 1×3 and 3×1 convolutions have asymmetric kernels and oriented structure priors, they may help to capture the features of extreme aspect-ratio and rotated text lines. + +TextNet is the backbone for Fast, but can also be used as an efficient text/image classification, we add a `TextNetForImageClassification` as is it would allow people to train an image classifier on top of the pre-trained textnet weights + +## TextNetConfig + +[[autodoc]] TextNetConfig + +## TextNetImageProcessor + +[[autodoc]] TextNetImageProcessor + - preprocess + +## TextNetModel + +[[autodoc]] TextNetModel + - forward + +## TextNetForImageClassification + +[[autodoc]] TextNetForImageClassification + - forward + diff --git a/docs/source/en/model_doc/vitpose.md b/docs/source/en/model_doc/vitpose.md new file mode 100644 index 00000000000000..361f8e30c75d2b --- /dev/null +++ b/docs/source/en/model_doc/vitpose.md @@ -0,0 +1,254 @@ + + +# VitPose + +## Overview + +The VitPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. VitPose employs a standard, non-hierarchical [Vision Transformer](https://arxiv.org/pdf/2010.11929v2) as backbone for the task of keypoint estimation. A simple decoder head is added on top to predict the heatmaps from a given image. Despite its simplicity, the model gets state-of-the-art results on the challenging MS COCO Keypoint Detection benchmark. + +The abstract from the paper is the following: + +*Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. 
However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art.* + +![vitpose-architecture](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose-architecture.png) + +This model was contributed by [nielsr](https://huggingface.co/nielsr) and [sangbumchoi](https://github.com/SangbumChoi). +The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPose). + +## Usage Tips + +ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt_detr.md), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints. + +```py +import torch +import requests +import numpy as np + +from PIL import Image + +from transformers import ( + AutoProcessor, + RTDetrForObjectDetection, + VitPoseForPoseEstimation, +) + +device = "cuda" if torch.cuda.is_available() else "cpu" + +url = "http://images.cocodataset.org/val2017/000000000139.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +# ------------------------------------------------------------------------ +# Stage 1. 
Detect humans on the image +# ------------------------------------------------------------------------ + +# You can choose detector by your choice +person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365") +person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device) + +inputs = person_image_processor(images=image, return_tensors="pt").to(device) + +with torch.no_grad(): + outputs = person_model(**inputs) + +results = person_image_processor.post_process_object_detection( + outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3 +) +result = results[0] # take first image results + +# Human label refers 0 index in COCO dataset +person_boxes = result["boxes"][result["labels"] == 0] +person_boxes = person_boxes.cpu().numpy() + +# Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format +person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0] +person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1] + +# ------------------------------------------------------------------------ +# Stage 2. Detect keypoints for each person found +# ------------------------------------------------------------------------ + +image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple") +model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple", device_map=device) + +inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device) + +with torch.no_grad(): + outputs = model(**inputs) + +pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes]) +image_pose_result = pose_results[0] # results for first image +``` + + +### Visualization for supervision user +```py +import supervision as sv + +xy = torch.stack([pose_result['keypoints'] for pose_result in image_pose_result]).cpu().numpy() +scores = torch.stack([pose_result['scores'] for pose_result in image_pose_result]).cpu().numpy() + +key_points = sv.KeyPoints( + xy=xy, confidence=scores +) + +edge_annotator = sv.EdgeAnnotator( + color=sv.Color.GREEN, + thickness=1 +) +vertex_annotator = sv.VertexAnnotator( + color=sv.Color.RED, + radius=2 +) +annotated_frame = edge_annotator.annotate( + scene=image.copy(), + key_points=key_points +) +annotated_frame = vertex_annotator.annotate( + scene=annotated_frame, + key_points=key_points +) +``` + +### Visualization for advanced user +```py +import math +import cv2 + +def draw_points(image, keypoints, scores, pose_keypoint_color, keypoint_score_threshold, radius, show_keypoint_weight): + if pose_keypoint_color is not None: + assert len(pose_keypoint_color) == len(keypoints) + for kid, (kpt, kpt_score) in enumerate(zip(keypoints, scores)): + x_coord, y_coord = int(kpt[0]), int(kpt[1]) + if kpt_score > keypoint_score_threshold: + color = tuple(int(c) for c in pose_keypoint_color[kid]) + if show_keypoint_weight: + cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1) + transparency = max(0, min(1, kpt_score)) + cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image) + else: + cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1) + +def draw_links(image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2): + height, width, _ = image.shape + if keypoint_edges is not None and link_colors is not None: + assert len(link_colors) == len(keypoint_edges) + for sk_id, sk in enumerate(keypoint_edges): 
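+            # `sk` holds the two keypoint indices of this edge; the link is drawn only when both
+            # endpoints lie inside the image and both scores clear `keypoint_score_threshold`.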
+ x1, y1, score1 = (int(keypoints[sk[0], 0]), int(keypoints[sk[0], 1]), scores[sk[0]]) + x2, y2, score2 = (int(keypoints[sk[1], 0]), int(keypoints[sk[1], 1]), scores[sk[1]]) + if ( + x1 > 0 + and x1 < width + and y1 > 0 + and y1 < height + and x2 > 0 + and x2 < width + and y2 > 0 + and y2 < height + and score1 > keypoint_score_threshold + and score2 > keypoint_score_threshold + ): + color = tuple(int(c) for c in link_colors[sk_id]) + if show_keypoint_weight: + X = (x1, x2) + Y = (y1, y2) + mean_x = np.mean(X) + mean_y = np.mean(Y) + length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5 + angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1])) + polygon = cv2.ellipse2Poly( + (int(mean_x), int(mean_y)), (int(length / 2), int(stick_width)), int(angle), 0, 360, 1 + ) + cv2.fillConvexPoly(image, polygon, color) + transparency = max(0, min(1, 0.5 * (keypoints[sk[0], 2] + keypoints[sk[1], 2]))) + cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image) + else: + cv2.line(image, (x1, y1), (x2, y2), color, thickness=thickness) + + +# Note: keypoint_edges and color palette are dataset-specific +keypoint_edges = model.config.edges + +palette = np.array( + [ + [255, 128, 0], + [255, 153, 51], + [255, 178, 102], + [230, 230, 0], + [255, 153, 255], + [153, 204, 255], + [255, 102, 255], + [255, 51, 255], + [102, 178, 255], + [51, 153, 255], + [255, 153, 153], + [255, 102, 102], + [255, 51, 51], + [153, 255, 153], + [102, 255, 102], + [51, 255, 51], + [0, 255, 0], + [0, 0, 255], + [255, 0, 0], + [255, 255, 255], + ] +) + +link_colors = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]] +keypoint_colors = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]] + +numpy_image = np.array(image) + +for pose_result in image_pose_result: + scores = np.array(pose_result["scores"]) + keypoints = np.array(pose_result["keypoints"]) + + # draw each point on image + draw_points(numpy_image, keypoints, scores, keypoint_colors, keypoint_score_threshold=0.3, radius=4, show_keypoint_weight=False) + + # draw links + draw_links(numpy_image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold=0.3, thickness=1, show_keypoint_weight=False) + +pose_image = Image.fromarray(numpy_image) +pose_image +``` +drawing + +### MoE backbone + +To enable MoE (Mixture of Experts) function in the backbone, user has to give appropriate configuration such as `num_experts` and input value `dataset_index` to the backbone model. However, it is not used in default parameters. Below is the code snippet for usage of MoE function. 
+ +```py +>>> from transformers import VitPoseBackboneConfig, VitPoseBackbone +>>> import torch + +>>> config = VitPoseBackboneConfig(num_experts=3, out_indices=[-1]) +>>> model = VitPoseBackbone(config) + +>>> pixel_values = torch.randn(3, 3, 256, 192) +>>> dataset_index = torch.tensor([1, 2, 3]) +>>> outputs = model(pixel_values, dataset_index) +``` + +## VitPoseImageProcessor + +[[autodoc]] VitPoseImageProcessor + - preprocess + +## VitPoseConfig + +[[autodoc]] VitPoseConfig + +## VitPoseForPoseEstimation + +[[autodoc]] VitPoseForPoseEstimation + - forward \ No newline at end of file diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 364141c8e406b2..d9bdf6f6e4841d 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -47,7 +47,9 @@ FlashAttention-2 is currently supported for the following architectures: * [Cohere2](https://huggingface.co/docs/transformers/model_doc/cohere2#transformers.Cohere2Model) * [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) +* [DiffLlama](https://huggingface.co/docs/transformers/model_doc/diffllama#transformers.DiffLlamaModel) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) +* [Emu3](https://huggingface.co/docs/transformers/model_doc/emu3) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model) * [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2) @@ -67,6 +69,7 @@ FlashAttention-2 is currently supported for the following architectures: * [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next) * [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video) * [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision) +* [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel) * [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi) * [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava) * [VideoLlava](https://huggingface.co/docs/transformers/model_doc/video_llava) @@ -106,6 +109,7 @@ FlashAttention-2 is currently supported for the following architectures: * [SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip) * [UniSpeech](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech#transformers.UniSpeechModel) * [unispeech_sat](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech-sat#transformers.UniSpeechSatModel) +* [helium](https://huggingface.co/docs/transformers/main/en/model_doc/heliumtransformers.HeliumModel) You can request to add FlashAttention-2 support for another model by opening a GitHub Issue or Pull Request. 
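
As a quick reference while reading the list above, this is roughly how FlashAttention-2 is enabled at load time; the checkpoint is only an example, the `flash-attn` package is assumed to be installed, and half-precision weights are required.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-3.1-8B"  # example checkpoint; any architecture from the list above works

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,               # FlashAttention-2 needs fp16 or bf16 weights
    attn_implementation="flash_attention_2",  # opt in to the FlashAttention-2 kernels
    device_map="auto",
)

inputs = tokenizer("FlashAttention-2 speeds up inference because", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```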
@@ -237,11 +241,13 @@ For now, Transformers supports SDPA inference and training for the following arc * [data2vec_vision](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecVisionModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel) +* [DiffLlama](https://huggingface.co/docs/transformers/model_doc/diffllama#transformers.DiffLlamaModel) * [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2) * [Dinov2_with_registers](https://huggingface.co/docs/transformers/en/model_doc/dinov2) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) * [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader) * [EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder_decoder#transformers.EncoderDecoderModel) +* [Emu3](https://huggingface.co/docs/transformers/model_doc/emu3) * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model) @@ -263,6 +269,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video) * [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision) * [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100#transformers.M2M100Model) +* [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel) * [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi) * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mllama](https://huggingface.co/docs/transformers/model_doc/mllama#transformers.MllamaForConditionalGeneration) @@ -281,8 +288,8 @@ For now, Transformers supports SDPA inference and training for the following arc * [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model) * [PhiMoE](https://huggingface.co/docs/transformers/model_doc/phimoe#transformers.PhimoeModel) * [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel) -* [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel) * [mBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel) +* [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel) * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel) * [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel) @@ -318,6 +325,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaModel) * [XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl#transformers.XLMRobertaXLModel) * [YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos#transformers.YolosModel) +* 
[helium](https://huggingface.co/docs/transformers/main/en/model_doc/helium#transformers.HeliumModel)


diff --git a/docs/source/en/quantization/gptq.md b/docs/source/en/quantization/gptq.md
index 5713ef4132a9a8..1534a977f3436f 100644
--- a/docs/source/en/quantization/gptq.md
+++ b/docs/source/en/quantization/gptq.md
@@ -22,15 +22,42 @@ Try GPTQ quantization with PEFT in this [notebook](https://colab.research.google
 
 
 
-The [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) library implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes the error. These weights are quantized to int4, but they're restored to fp16 on the fly during inference. This can save your memory-usage by 4x because the int4 weights are dequantized in a fused kernel rather than a GPU's global memory, and you can also expect a speedup in inference because using a lower bitwidth takes less time to communicate.
+Both the [GPTQModel](https://github.com/ModelCloud/GPTQModel) and [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) libraries implement the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes error. These weights are quantized to int4, stored as int32 (int4 x 8) and dequantized (restored) to fp16 on the fly during inference. This can save memory by almost 4x because the int4 weights are dequantized in a fused kernel. You can also expect a substantial speedup in inference due to lower bandwidth requirements for lower bitwidth.
 
-Before you begin, make sure the following libraries are installed:
+[GPTQModel](https://github.com/ModelCloud/GPTQModel) started as a maintained fork of AutoGPTQ but has since differentiated itself with the following major differences:
+
+* Model support: GPTQModel continues to support all of the latest LLM models.
+* Multimodal support: GPTQModel supports accurate quantization of Qwen 2-VL and Ovis 1.6-VL image-to-text models.
+* Platform support: Linux, macOS (Apple Silicon), and Windows 11.
+* Hardware support: NVIDIA CUDA, AMD ROCm, Apple Silicon M1/MPS/CPU, Intel/AMD CPU, and Intel Datacenter Max/Arc GPUs.
+* Asymmetric support: Asymmetric quantization can potentially introduce lower quantization errors compared to symmetric quantization. However, it is not backward compatible with AutoGPTQ, and not all kernels, such as Marlin, support asymmetric quantization.
+* IPEX kernel for Intel/AMD accelerated CPU and Intel GPU (Datacenter Max/Arc GPUs) support.
+* Updated Marlin kernel from Neural Magic optimized for A100 (Ampere).
+* Updated kernels with auto-padding for legacy model support and models with non-uniform in/out-features.
+* Faster quantization, lower memory usage, and more accurate default quantization via GPTQModel quantization APIs.
+* User- and developer-friendly APIs.
+
+
+[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) will likely be deprecated in the future due to the lack of continued support for new models and features.
+
+Before you begin, make sure the following libraries are installed and updated to the latest release:
 
 ```bash
-pip install auto-gptq
 pip install --upgrade accelerate optimum transformers
 ```
 
+Then install either GPTQModel or AutoGPTQ.
+ +```bash +pip install gptqmodel --no-build-isolation +``` + +or + +```bash +pip install auto-gptq --no-build-isolation +``` + To quantize a model (currently only supported for text models), you need to create a [`GPTQConfig`] class and set the number of bits to quantize to, a dataset to calibrate the weights for quantization, and a tokenizer to prepare the dataset. ```py @@ -92,9 +119,22 @@ from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto") ``` +## Marlin + +[Marlin](https://github.com/IST-DASLab/marlin) is a 4-bit only CUDA GPTQ kernel, highly optimized for the NVIDIA A100 GPU (Ampere) architecture. Loading, dequantization, and execution of post-dequantized weights are highly parallelized, offering a substantial inference improvement versus the original CUDA GPTQ kernel. Marlin is only available for quantized inference and does not support model quantization. + +Marlin inference can be activated with the `backend` parameter in [`GPTQConfig`]. + +```py + +from transformers import AutoModelForCausalLM, GPTQConfig + +model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=GPTQConfig(bits=4, backend="marlin")) +``` + ## ExLlama -[ExLlama](https://github.com/turboderp/exllama) is a Python/C++/CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object. To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter: +[ExLlama](https://github.com/turboderp/exllama) is a CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object. To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter: ```py import torch @@ -110,11 +150,11 @@ Only 4-bit models are supported, and we recommend deactivating the ExLlama kerne -The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2), then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file. +The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ or GPTQModel, then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file. 
```py import torch from transformers import AutoModelForCausalLM, GPTQConfig gptq_config = GPTQConfig(bits=4, use_exllama=False) model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="cpu", quantization_config=gptq_config) -``` \ No newline at end of file +``` diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index 48840fad646fd0..dfe680832b1952 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -45,32 +45,50 @@ In short, supporting a wide range of quantization methods allows you to pick the Use the table below to help you decide which quantization method to use. -| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | Intel GPU | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library | -|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-----------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------| -| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | -| [AWQ](./awq) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | -| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🟡 * | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | -| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | -| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? 
| 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | -| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp | -| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | -| [HIGGS](./higgs) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2 - 4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute | -| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | -| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | -| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | -| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | 🔴 | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | -| [VPTQ](./vptq) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ | +| Quantization Method | On the fly quantization | CPU | CUDA GPU | ROCm GPU | Metal (Apple Silicon) | Intel GPU | Torch compile() | Bits | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support | Link to library | +|-----------------------------------------------|----------------------|-----------------|----------|-----------|------------------------------------|-----------------|-----------------|---------------|------------------|-----------------------------|-------------------------|---------------------------------------------| +| [AQLM](./aqlm.md) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | +| [AWQ](./awq.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | +| [bitsandbytes](./bitsandbytes.md) | 🟢 | 🟡 1 | 🟢 | 🟡 1 | 🔴 2 | 🟡 1 | 🔴 1 | 4/8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | +| [compressed-tensors](./compressed_tensors.md) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1/8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | +| [EETQ](./eetq.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | +| [GGUF / GGML (llama.cpp)](../gguf.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1/8 | 🔴 | [See Notes](../gguf.md) | [See Notes](../gguf.md) | https://github.com/ggerganov/llama.cpp | +| [GPTQModel](./gptq.md) | 🔴 | 🟢 3 | 🟢 | 🟢 | 🟢 | 🟢 4 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel | +| [AutoGPTQ](./gptq.md) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | +| [HIGGS](./higgs.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2/4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute | +| [HQQ](./hqq.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | +| [optimum-quanto](./quanto.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | +| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | +| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | 🟡 5 | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | +| [VPTQ](./vptq.md) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1/8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ | + +**1:** bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. 
For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. -\* bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend). + + + + +**2:** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships. + + + + -We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links. +**3:** GPTQModel[CPU] supports 4-bit via IPEX on Intel/AMD and full bit range via Torch on Intel/AMD/Apple Silicon. -\** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships. +**4:** GPTQModel[Intel GPU] via IPEX only supports 4-bit for Intel Datacenter Max/Arc GPUs. + + + + + +**5:** torchao only supports int4 weight on Metal (Apple Silicon). + + - \ No newline at end of file diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index 38f7c074c97d90..46fb0f8cbb9a88 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -16,7 +16,8 @@ rendered properly in your Markdown viewer. Before you begin, make sure the following libraries are installed with their latest version: ```bash -pip install --upgrade torch torchao +# Updating 🤗 Transformers to the latest version, as the example script below uses the new auto compilation +pip install --upgrade torch torchao transformers ``` By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. @@ -35,12 +36,8 @@ tokenizer = AutoTokenizer.from_pretrained(model_name) input_text = "What are we having for dinner?" 
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") -# compile the quantized model to get speedup -import torchao -torchao.quantization.utils.recommended_inductor_config_setter() -quantized_model = torch.compile(quantized_model, mode="max-autotune") - -output = quantized_model.generate(**input_ids, max_new_tokens=10) +# auto-compile the quantized model with `cache_implementation="static"` to get speedup +output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") print(tokenizer.decode(output[0], skip_special_tokens=True)) # benchmark the performance @@ -59,11 +56,11 @@ def benchmark_fn(f, *args, **kwargs): return f"{(t0.blocked_autorange().mean):.3f}" MAX_NEW_TOKENS = 1000 -print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)) +print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static")) bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16) -bf16_model = torch.compile(bf16_model, mode="max-autotune") -print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS)) +output = bf16_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") # auto-compile +print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static")) ``` diff --git a/docs/source/en/tasks/video_text_to_text.md b/docs/source/en/tasks/video_text_to_text.md index 3929f7994bdafb..4c10907e4571bf 100644 --- a/docs/source/en/tasks/video_text_to_text.md +++ b/docs/source/en/tasks/video_text_to_text.md @@ -144,4 +144,4 @@ print(processor.decode(output[0][2:], skip_special_tokens=True)[len(user_prompt) And voila! -To learn more about chat templates and token streaming for video-text-to-text models, refer to the [image-text-to-text](../image_text_to_text) task guide because these models work similarly. \ No newline at end of file +To learn more about chat templates and token streaming for video-text-to-text models, refer to the [image-text-to-text](../tasks/image_text_to_text) task guide because these models work similarly. 
\ No newline at end of file diff --git a/docs/source/ja/model_doc/decision_transformer.md b/docs/source/ja/model_doc/decision_transformer.md index 9c7f27bbeeec2d..fe37feb5a35d54 100644 --- a/docs/source/ja/model_doc/decision_transformer.md +++ b/docs/source/ja/model_doc/decision_transformer.md @@ -23,31 +23,28 @@ Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laski 論文の要約は次のとおりです。 -*強化学習(RL)をシーケンスモデリング問題として抽象化するフレームワークを紹介します。 +_強化学習(RL)をシーケンスモデリング問題として抽象化するフレームワークを紹介します。 これにより、Transformer アーキテクチャのシンプルさとスケーラビリティ、および関連する進歩を活用できるようになります。 - GPT-x や BERT などの言語モデリングで。特に、Decision Transformer というアーキテクチャを紹介します。 - RL の問題を条件付きシーケンス モデリングとして投げかけます。値関数に適合する以前の RL アプローチとは異なり、 - ポリシー勾配を計算すると、Decision Transformer は因果的にマスクされたアルゴリズムを利用して最適なアクションを出力するだけです。 - 変成器。望ましいリターン (報酬)、過去の状態、アクションに基づいて自己回帰モデルを条件付けすることにより、 - Decision Transformer モデルは、望ましいリターンを達成する将来のアクションを生成できます。そのシンプルさにも関わらず、 - Decision Transformer は、最先端のモデルフリーのオフライン RL ベースラインのパフォーマンスと同等、またはそれを超えています。 - Atari、OpenAI Gym、Key-to-Door タスク* +GPT-x や BERT などの言語モデリングで。特に、Decision Transformer というアーキテクチャを紹介します。 +RL の問題を条件付きシーケンス モデリングとして投げかけます。値関数に適合する以前の RL アプローチとは異なり、 +ポリシー勾配を計算すると、Decision Transformer は因果的にマスクされたアルゴリズムを利用して最適なアクションを出力するだけです。 +変成器。望ましいリターン (報酬)、過去の状態、アクションに基づいて自己回帰モデルを条件付けすることにより、 +Decision Transformer モデルは、望ましいリターンを達成する将来のアクションを生成できます。そのシンプルさにも関わらず、 +Decision Transformer は、最先端のモデルフリーのオフライン RL ベースラインのパフォーマンスと同等、またはそれを超えています。 +Atari、OpenAI Gym、Key-to-Door タスク_ このバージョンのモデルは、状態がベクトルであるタスク用です。 -このモデルは、[edbeeching](https://huggingface.co/edbeeching) によって提供されました。元のコードは [ここ](https://github.com/kzl/decion-transformer) にあります。 +このモデルは、[edbeeching](https://huggingface.co/edbeeching) によって提供されました。元のコードは [ここ](https://github.com/kzl/decision-transformer) にあります。 ## DecisionTransformerConfig [[autodoc]] DecisionTransformerConfig - ## DecisionTransformerGPT2Model -[[autodoc]] DecisionTransformerGPT2Model - - forward +[[autodoc]] DecisionTransformerGPT2Model - forward ## DecisionTransformerModel -[[autodoc]] DecisionTransformerModel - - forward +[[autodoc]] DecisionTransformerModel - forward diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 54740610ee1148..dfa50f9f13a1f1 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -145,38 +145,14 @@ title: (번역중) Getting started - local: quantization/bitsandbytes title: bitsandbytes - - local: in_translation - title: (번역중) GPTQ + - local: quantization/gptq + title: GPTQ - local: quantization/awq title: AWQ - local: in_translation title: (번역중) AQLM - local: in_translation title: (번역중) VPTQ - - local: in_translation - title: (번역중) Quanto - - local: in_translation - title: (번역중) EETQ - - local: in_translation - title: (번역중) HQQ - - local: in_translation - title: (번역중) Optimum - - local: in_translation - title: (번역중) Contribute new quantization method - title: (번역중) 경량화 메소드 -- sections: - - local: in_translation - title: (번역중) Getting started - - local: in_translation - title: (번역중) bitsandbytes - - local: quantization/gptq - title: GPTQ - - local: in_translation - title: (번역중) AWQ - - local: in_translation - title: (번역중) AQLM - - local: in_translation - title: (번역중) VPTQ - local: quantization/quanto title: Quanto - local: quantization/eetq @@ -692,8 +668,8 @@ sections: - local: in_translation title: (번역중) ALIGN - - local: in_translation - title: (번역중) AltCLIP + - local: model_doc/altclip + title: AltCLIP - local: model_doc/blip-2 title: BLIP-2 - local: model_doc/blip diff --git a/docs/source/ko/model_doc/altclip.md 
b/docs/source/ko/model_doc/altclip.md new file mode 100644 index 00000000000000..1236bcc9aaa7da --- /dev/null +++ b/docs/source/ko/model_doc/altclip.md @@ -0,0 +1,78 @@ +# AltCLIP + +## 개요[[overview]] + +AltCLIP 모델은 Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong Ye, Qinghong Yang, Ledell Wu의 [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679v2) 논문에서 제안되었습니다. AltCLIP(CLIP의 언어 인코더를 변경하여 언어 기능 확장)은 다양한 이미지-텍스트 및 텍스트-텍스트 쌍으로 훈련된 신경망입니다. CLIP의 텍스트 인코더를 사전 훈련된 다국어 텍스트 인코더 XLM-R로 교체하여, 거의 모든 작업에서 CLIP과 유사한 성능을 얻을 수 있었으며, 원래 CLIP의 다국어 이해와 같은 기능도 확장되었습니다. + +논문의 초록은 다음과 같습니다: + +*본 연구에서는 강력한 이중 언어 멀티모달 표현 모델을 훈련하는 개념적으로 간단하고 효과적인 방법을 제시합니다. OpenAI에서 출시한 사전 훈련된 멀티모달 표현 모델 CLIP에서 시작하여, 그 텍스트 인코더를 사전 훈련된 다국어 텍스트 인코더 XLM-R로 교체하고, 교사 학습과 대조 학습으로 구성된 2단계 훈련 스키마를 통해 언어와 이미지 표현을 정렬했습니다. 우리는 광범위한 작업 평가를 통해 우리의 방법을 검증했습니다. ImageNet-CN, Flicker30k-CN, COCO-CN을 포함한 여러 작업에서 새로운 최고 성능을 달성했으며, 거의 모든 작업에서 CLIP과 유사한 성능을 얻었습니다. 이는 CLIP의 텍스트 인코더를 단순히 변경하여 다국어 이해와 같은 확장 기능을 얻을 수 있음을 시사합니다.* + +이 모델은 [jongjyh](https://huggingface.co/jongjyh)에 의해 기여되었습니다. + +## 사용 팁과 예제[[usage-tips-and-example]] + +AltCLIP의 사용법은 CLIP과 매우 유사하며, 차이점은 텍스트 인코더에 있습니다. 일반적인 어텐션 대신 양방향 어텐션을 사용하며, XLM-R의 [CLS] 토큰을 사용하여 텍스트 임베딩을 나타냅니다. + +AltCLIP은 멀티모달 비전 및 언어 모델입니다. 이미지와 텍스트 간의 유사성 계산 및 제로샷 이미지 분류에 사용할 수 있습니다. AltCLIP은 ViT와 같은 트랜스포머를 사용하여 시각적 특징을 얻고, 양방향 언어 모델을 사용하여 텍스트 특징을 얻습니다. 이후 텍스트와 시각적 특징 모두 동일한 차원의 잠재 공간으로 투사됩니다. 투사된 이미지와 텍스트 특징 간의 내적을 유사도 점수로 사용합니다. + +이미지를 트랜스포머 인코더에 입력하기 위해, 각 이미지를 일정한 크기의 겹치지 않는 패치 시퀀스로 분할한 뒤, 이를 선형 임베딩합니다. 전체 이미지를 나타내기 위해 [CLS] 토큰이 추가됩니다. 저자들은 절대 위치 임베딩도 추가하여 결과 벡터 시퀀스를 표준 트랜스포머 인코더에 입력합니다. [`CLIPImageProcessor`]는 모델을 위해 이미지를 크기 조정하고 정규화하는 데 사용할 수 있습니다. + +[`AltCLIPProcessor`]는 [`CLIPImageProcessor`]와 [`XLMRobertaTokenizer`]를 하나의 인스턴스로 묶어 텍스트를 인코딩하고 이미지를 준비합니다. 다음 예제는 [`AltCLIPProcessor`]와 [`AltCLIPModel`]을 사용하여 이미지와 텍스트 간의 유사성 점수를 얻는 방법을 보여줍니다. +```python +>>> from PIL import Image +>>> import requests + +>>> from transformers import AltCLIPModel, AltCLIPProcessor + +>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP") +>>> processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP") + +>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(url, stream=True).raw) + +>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) + +>>> outputs = model(**inputs) +>>> logits_per_image = outputs.logits_per_image # 이미지-텍스트 유사도 점수 +>>> probs = logits_per_image.softmax(dim=1) # 라벨 마다 확률을 얻기 위해 softmax 적용 +``` + + +이 모델은 `CLIPModel`을 기반으로 하므로, 원래 CLIP처럼 사용할 수 있습니다. + + + +## AltCLIPConfig + +[[autodoc]] AltCLIPConfig + - from_text_vision_configs + +## AltCLIPTextConfig + +[[autodoc]] AltCLIPTextConfig + +## AltCLIPVisionConfig + +[[autodoc]] AltCLIPVisionConfig + +## AltCLIPProcessor + +[[autodoc]] AltCLIPProcessor + +## AltCLIPModel + +[[autodoc]] AltCLIPModel + - forward + - get_text_features + - get_image_features + +## AltCLIPTextModel + +[[autodoc]] AltCLIPTextModel + - forward + +## AltCLIPVisionModel + +[[autodoc]] AltCLIPVisionModel + - forward diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index ee155e377e4137..692a43f9d23cdd 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index 095af99efffccc..ae106c7264317b 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risk. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index ddbde78f703ce2..0c4edda3bd4afb 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 5f1988c36de17c..bc83fb53e2120e 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/modular-transformers/configuration_my_new_model.py b/examples/modular-transformers/configuration_my_new_model.py index 7042c586cbb636..59637e02d3f11c 100644 --- a/examples/modular-transformers/configuration_my_new_model.py +++ b/examples/modular-transformers/configuration_my_new_model.py @@ -43,7 +43,7 @@ class MyNewModelConfig(PretrainedConfig): The non-linear activation function (function or string) in the decoder. max_position_embeddings (`int`, *optional*, defaults to 2048): The maximum sequence length that this model might ever be used with. MyNewModel 1 supports up to 2048 tokens, - MyNewModel 2 up to 4096, CodeMyNewModel up to 16384. + MyNewModel 2 up to 4096, CodeLlama up to 16384. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. rms_norm_eps (`float`, *optional*, defaults to 1e-06): @@ -110,7 +110,7 @@ class MyNewModelConfig(PretrainedConfig): mlp_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. head_dim (`int`, *optional*): - The attention head dimension. If None, it will default to hidden_size // num_heads + The attention head dimension. 
If None, it will default to hidden_size // num_attention_heads ```python >>> from transformers import MyNewModelModel, MyNewModelConfig diff --git a/examples/modular-transformers/image_processing_new_imgproc_model.py b/examples/modular-transformers/image_processing_new_imgproc_model.py index a64eb17861a1c2..f3ab1772ec59e9 100644 --- a/examples/modular-transformers/image_processing_new_imgproc_model.py +++ b/examples/modular-transformers/image_processing_new_imgproc_model.py @@ -247,7 +247,7 @@ def preprocess( # All transformations expect numpy arrays. images = [to_numpy_array(image) for image in images] - if is_scaled_image(images[0]) and do_rescale: + if do_rescale and is_scaled_image(images[0]): logger.warning_once( "It looks like you are trying to rescale already rescaled images. If the input" " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." diff --git a/examples/modular-transformers/modeling_dummy.py b/examples/modular-transformers/modeling_dummy.py index 3e0aa6e9b2ad02..382b87bd38471e 100644 --- a/examples/modular-transformers/modeling_dummy.py +++ b/examples/modular-transformers/modeling_dummy.py @@ -597,7 +597,7 @@ def _update_causal_mask( output_attentions: bool, ): if self.config._attn_implementation == "flash_attention_2": - if attention_mask is not None and 0.0 in attention_mask: + if attention_mask is not None and (attention_mask == 0.0).any(): return attention_mask return None diff --git a/examples/modular-transformers/modeling_multimodal1.py b/examples/modular-transformers/modeling_multimodal1.py index c4f90a5cbadab3..df23a83b341144 100644 --- a/examples/modular-transformers/modeling_multimodal1.py +++ b/examples/modular-transformers/modeling_multimodal1.py @@ -597,7 +597,7 @@ def _update_causal_mask( output_attentions: bool, ): if self.config._attn_implementation == "flash_attention_2": - if attention_mask is not None and 0.0 in attention_mask: + if attention_mask is not None and (attention_mask == 0.0).any(): return attention_mask return None diff --git a/examples/modular-transformers/modeling_my_new_model2.py b/examples/modular-transformers/modeling_my_new_model2.py index b8d5b5eb910095..9288b1a2930578 100644 --- a/examples/modular-transformers/modeling_my_new_model2.py +++ b/examples/modular-transformers/modeling_my_new_model2.py @@ -602,7 +602,7 @@ def _update_causal_mask( output_attentions: bool, ): if self.config._attn_implementation == "flash_attention_2": - if attention_mask is not None and 0.0 in attention_mask: + if attention_mask is not None and (attention_mask == 0.0).any(): return attention_mask return None diff --git a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py index 477d084b1d9309..da0b354fe76efa 100644 --- a/examples/modular-transformers/modeling_new_task_model.py +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -249,9 +249,6 @@ def set_decoder(self, decoder): def get_decoder(self): return self.language_model.get_decoder() - def tie_weights(self): - return self.language_model.tie_weights() - def _update_causal_mask( self, attention_mask, diff --git a/examples/modular-transformers/modeling_super.py b/examples/modular-transformers/modeling_super.py index 42d8108ee72a68..1f5aa55c46909e 100644 --- a/examples/modular-transformers/modeling_super.py +++ b/examples/modular-transformers/modeling_super.py @@ -519,7 +519,7 @@ def _update_causal_mask( output_attentions: bool, ): if self.config._attn_implementation == 
"flash_attention_2": - if attention_mask is not None and 0.0 in attention_mask: + if attention_mask is not None and (attention_mask == 0.0).any(): return attention_mask return None diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index ef308316569b79..d9cb0187e4d82f 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index d42fb52d5c13c0..f8170bb416dd8c 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 111d8adce8b41a..32d85b7d98dab1 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 6cbcac0a7e688a..0dc3e11f08abc7 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index f23e55191709a5..6308e250f5c341 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 9d052076b7b162..3721dc267c2273 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 100a1365c2e9ea..0d8496bb3fed15 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 806330fb72d107..306e8085f6763f 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index d888b7853dd475..e9cd01610bb7b3 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 10bfee8f25f775..54098f5a7dd617 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 078b0add065ceb..e35e4e7d907ecb 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index cac845f3a055df..e94690eaa7fcd3 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -58,7 +58,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 0a0e10511fa236..64e340a62a7639 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 8cb30099491a3a..fde2980d3ab5b8 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 0bff38707d567a..40265efcfdae29 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 20763558a5f626..6ef17ebb9b6af8 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index f188e4e476a208..2d927f41978824 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 2d4e8bdbb92c06..9ec972d091c74b 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 07fcb36acb1583..8d722f4d5d5ddb 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index 33ad0499301e18..dbfcb3fd97fae8 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 6de464f4367072..81fcc7b8b70b96 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index c3e12ac9edef16..4b199a9e8990ba 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 8e791564b007bf..312d8b389dd610 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 6ccce481b548a9..da448c37f2c4d6 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index c1874b3fe18e12..cd6204c467c2a2 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 6f77f82564172d..0551b5f61f10c5 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 16e64eb92343f1..8a0d9de748f9ec 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 9eb3498c8c174e..70fc035fabb845 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 682d3b16d216a3..a6e3d9f7f33fc0 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index e6a643e4213910..6d3950802c83ef 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index a2d09f20004704..93036a7e03ac06 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index ab5ab7adb19c44..c2201840cacc18 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index dae845b119b172..fef77f4108a39d 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 2a99bc42e1195a..9f1c93f3df2edb 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 8da7e86d87551f..61bd746f0782e4 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index c76f83ce4def6f..0c7d2c44b8788e 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 056db7167280ed..81340dc2eef8ad 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 56e3a1e646db34..5b8c4c80ef8940 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index dadfcb80941e27..1d8ff8d05e530a 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index df4c1e9557a982..50213214304692 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt index 33bb1d658595e5..b8ee442d9bfeae 100644 --- a/examples/research_projects/decision_transformer/requirements.txt +++ b/examples/research_projects/decision_transformer/requirements.txt @@ -92,7 +92,7 @@ itsdangerous==2.1.1 jax==0.3.4 jaxlib==0.3.2 jedi==0.18.1 -Jinja2==3.1.4 +Jinja2==3.1.5 jinja2-time==0.2.0 jmespath==0.10.0 joblib==1.2.0 diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 01c31de8730b12..bc656ba6ff1c2c 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index 296c70549bda1b..6e4be8dcb0565a 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index b35d761d8a6a6a..02b8e565240166 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. 
Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index a78a5d89e19f6c..46f0470d1c5686 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -62,7 +62,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 92a10990d160ed..3fd823ec8c08b6 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index a8f2de825cc2e4..0fbe3790e0589c 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 1afb72cf10984e..148ee55f26c3dd 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.48.0.dev0") +check_min_version("4.49.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/read_video.py b/read_video.py new file mode 100644 index 00000000000000..25e201a6e48d83 --- /dev/null +++ b/read_video.py @@ -0,0 +1,77 @@ +import numpy as np +import cv2 +import requests +from yt_dlp import YoutubeDL +from contextlib import redirect_stdout +from pathlib import Path +import io +import imageio.v3 as iio + + +url = "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4" +vid = cv2.VideoCapture(url) +# ret, frame = vid.read() + +while(True): + # Capture frame-by-frame + ret, frame = vid.read() + #print cap.isOpened(), ret + if frame is not None: + pass + # print(frame.shape) + else: + break + +print(vid.isOpened(), frame is not None) + +buffer = io.BytesIO(requests.get(url).content) +video = buffer.getvalue() +frames = iio.imread(video, index=None) +print(frames.shape) + + + + + +youtube_id = "https://www.youtube.com/watch?v=BaW_jenozKc" + +ctx = { + "outtmpl": "-", + 'logtostderr': True +} + +buffer = io.BytesIO() +with redirect_stdout(buffer), YoutubeDL(ctx) as foo: + foo.download([youtube_id]) +# Path(f"vi.mp4").write_bytes(buffer.getvalue()) + +video = buffer.getvalue() +print(type(video)) +frames = iio.imread(video, index=None) +print(frames.shape) + + +import decord +file_obj = io.BytesIO(video) +container = decord.VideoReader(file_obj) +print(container[2].shape) + +# print(np.frombuffer(video, dtype=np.uint8).shape) +# img_array = np.asarray(bytearray(video), dtype=np.uint8) +# im = cv2.imdecode(img_array, cv2.IMREAD_UNCHANGED) + + + +import av + +file_obj = io.BytesIO(video) +container = av.open(file_obj) +container.seek(0) +frames = [] +for i, frame in enumerate(container.decode(video=0)): + if i > 10: + break + if i >= 0: + frames.append(frame) +out = np.stack([x.to_ndarray(format="rgb24") for x in frames]) +print(out.shape) diff --git a/run.py b/run.py new file mode 100644 index 00000000000000..b79ba1ecf3fb9f --- /dev/null +++ b/run.py @@ -0,0 +1,107 @@ +import av +import torch +import decord +from decord import VideoReader, cpu + +import numpy as np +from PIL import Image +from huggingface_hub import hf_hub_download +from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration, SiglipImageProcessor + +model_id = "/raid/raushan/llava-next-video-qwen-7b" + +model = LlavaNextVideoForConditionalGeneration.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, +).to(0) + +processor = LlavaNextVideoProcessor.from_pretrained(model_id, torch_dtype=torch.bfloat16) +img_proc = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384") + +image = Image.open("/raid/raushan/image.png") + + +def load_video(video_path, max_frames_num,fps=1,force_sample=False): + + vr = VideoReader(video_path) + total_frame_num = len(vr) + video_time = total_frame_num / vr.get_avg_fps() + fps = round(vr.get_avg_fps()/fps) + frame_idx = [i for i in range(0, len(vr), fps)] + frame_time = [i/fps for i in frame_idx] + if len(frame_idx) > max_frames_num or force_sample: + sample_fps = max_frames_num + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + frame_time = [i/vr.get_avg_fps() for i in frame_idx] + frame_time = ",".join([f"{i:.2f}s" for i in frame_time]) + spare_frames = vr.get_batch(frame_idx).asnumpy() + 
+    print(spare_frames.shape)
+    return spare_frames, frame_time, video_time
+
+
+def read_video_pyav(container, indices):
+    '''
+    Decode the video with PyAV decoder.
+    Args:
+        container (`av.container.input.InputContainer`): PyAV container.
+        indices (`List[int]`): List of frame indices to decode.
+    Returns:
+        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+    '''
+    frames = []
+    container.seek(0)
+    start_index = indices[0]
+    end_index = indices[-1]
+    for i, frame in enumerate(container.decode(video=0)):
+        if i > end_index:
+            break
+        if i >= start_index and i in indices:
+            frames.append(frame)
+    return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
+# Define a chat history and use `apply_chat_template` to get a correctly formatted prompt.
+# Each value in "content" has to be a list of dicts with types ("text", "image", "video").
+# <|im_start|>system
+# You are a helpful assistant.<|im_end|>
+# <|im_start|>user
+# Time frames are these moments and we have 64 frames
+# Please describe this video in detail.<|im_end|>
+# <|im_start|>assistant
+
+conversation = [
+    {
+
+        "role": "system",
+        "content": [
+            {"type": "text", "text": "You are a helpful assistant."},
+        ],
+    },
+    {
+
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "The video lasts for 19.97 seconds, and 64 frames are uniformly sampled from it. These frames are located at 0.00s,0.30s,0.60s,0.93s,1.23s,1.57s,1.87s,2.20s,2.50s,2.83s,3.13s,3.47s,3.77s,4.10s,4.40s,4.73s,5.03s,5.37s,5.67s,6.00s,6.30s,6.63s,6.93s,7.27s,7.57s,7.90s,8.20s,8.53s,8.83s,9.17s,9.47s,9.80s,10.10s,10.43s,10.73s,11.07s,11.37s,11.70s,12.00s,12.33s,12.63s,12.97s,13.27s,13.60s,13.90s,14.23s,14.53s,14.87s,15.17s,15.50s,15.80s,16.13s,16.43s,16.77s,17.07s,17.40s,17.70s,18.03s,18.33s,18.67s,18.97s,19.30s,19.60s,19.93s.Please answer the following questions related to this video.\nPlease describe this video in detail."},
+            {"type": "video"},
+        ],
+    },
+]
+
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n