diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index 8a9d754f65..3f99081031 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -132,10 +132,13 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, if num == 0: warmup_md5[prompt_index] = result_md5_list per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) - tm_list = bench_hook.get_time_list() - log.debug('latency of all tokens:') - [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] - tm_infer_list = bench_hook.get_time_infer_list() + tm_list = [] + tm_infer_list = [] + if bench_hook is not None: + tm_list = bench_hook.get_time_list() + log.debug('latency of all tokens:') + [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + tm_infer_list = bench_hook.get_time_infer_list() iter_data = gen_iterate_data( num, input_token_size * args['batch_size'], @@ -168,8 +171,9 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) else: utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) - bench_hook.clear_time_list() - bench_hook.clear_time_infer_list() + if bench_hook is not None: + bench_hook.clear_time_list() + bench_hook.clear_time_infer_list() def run_text_generation_benchmark(model_path, framework, device, args, num_iters): diff --git a/llm_bench/python/utils/hook_beam_search.py b/llm_bench/python/utils/hook_beam_search.py index 97957559a4..99b0a9e5c3 100644 --- a/llm_bench/python/utils/hook_beam_search.py +++ b/llm_bench/python/utils/hook_beam_search.py @@ -5,29 +5,21 @@ import time import torch import warnings -import transformers import logging as log -import utils.hook_common as hook_common from torch import nn -from packaging import version from typing import Optional, Tuple, Union, List from transformers.generation.stopping_criteria import ( + EosTokenCriteria, StoppingCriteriaList, validate_stopping_criteria, ) from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.beam_search import BeamScorer -from transformers.generation.stopping_criteria import ( - EosTokenCriteria, - StoppingCriteriaList, - validate_stopping_criteria, -) from transformers.generation.utils import ( _split_model_inputs, stack_model_outputs, ) from transformers.utils import ModelOutput -import utils.hook_beam_search_old as hook_old_beam logger = log.getLogger(__name__) @@ -64,8 +56,8 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput): tm_infer_list = [] -# Transformers version: Release/v4.39.2 97c00cdfe132164dbd793447a088432fa359fd36 -# Copied from https://github.com/eaidova/optimum-intel/blob/86c2baf253c1f8d063bf71e143837e5d8e629909/optimum/intel/openvino/modeling_decoder.py#L767 +# Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99 +# Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2911 # Add the function of collecting latency def new_beam_search( self, @@ -85,6 +77,120 @@ def new_beam_search( sequential: Optional[bool] = None, **model_kwargs, ) -> Union[GenerateBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **beam search decoding** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text 
models. + + + + In most cases, you do not need to call [`~generation.GenerationMixin._beam_search`] directly. Use generate() + instead. For an overview of generation strategies and code examples, check the [following + guide](../generation_strategies). + + + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + beam_scorer (`BeamScorer`): + An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and + sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. + logits_processor (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`, *optional*): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + max_length (`int`, *optional*, defaults to 20): + **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated + tokens. The maximum length of the sequence to be generated. + pad_token_id (`int`, *optional*): + The id of the *padding* token. + eos_token_id (`Union[int, List[int]]`, *optional*): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for + more details. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + synced_gpus (`bool`, *optional*, defaults to `False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + sequential (`bool`, defaults to `False`): + By default, beam search has `batch_size * num_beams` as effective batch size (see `beam_search()` for + more details). This flag will avoid parallelizing the beam search and will instead run beam search + sequentially. + model_kwargs: + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or + `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + + + Examples: + + ```python + >>> from transformers import ( + ... AutoTokenizer, + ... 
AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... BeamSearchScorer, + ... ) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + + >>> # lets run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()( + ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True + ... ) + ... } + + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... num_beams=num_beams, + ... device=model.device, + ... ) + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList( + ... [ + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), + ... ] + ... ) + + >>> outputs = model._beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) + + >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) + ['Wie alt bist du?'] + ```""" + # init values logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() sequential = sequential if sequential is not None else self.generation_config.low_memory @@ -107,6 +213,8 @@ def new_beam_search( ) stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) else: + # TODO remove when the method is totally private and beam scorer refactored + # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever eos_token_id = [ criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id") ] @@ -139,16 +247,28 @@ def new_beam_search( cur_len = model_kwargs["inputs_embeds"].shape[1] model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) + if num_beams * batch_size != batch_beam_size: + raise ValueError( + f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." 
+ ) + # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None raw_logits = () if (return_dict_in_generate and output_logits) else None beam_indices = ( - tuple(() for _ in range(num_beams * batch_size)) if (return_dict_in_generate and output_scores) else None + tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None ) decoder_attentions = () if (return_dict_in_generate and output_attentions) else None cross_attentions = () if (return_dict_in_generate and output_attentions) else None decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens # of the first beam are considered to avoid sampling the exact same tokens across all beams. beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) @@ -158,7 +278,6 @@ def new_beam_search( this_peer_finished = False decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder - first_iteration = True while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): tic = time.perf_counter() @@ -167,10 +286,27 @@ def new_beam_search( # if sequential is True, split the input to batches of batch_size and run sequentially tic_infer = time.perf_counter() if sequential: + if any( + model_name in self.__class__.__name__.lower() + for model_name in [ + "fsmt", + "reformer", + "bloom", + "ctrl", + "gpt_bigcode", + "transo_xl", + "xlnet", + "cpm", + "jamba", + ] + ): + raise RuntimeError( + f"Currently generation for {self.__class__.__name__} is not supported " + f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature." 
+ ) + inputs_per_sub_batches = _split_model_inputs( - model_inputs, - split_size=batch_size, - full_batch_size=batch_beam_size if not first_iteration else batch_size, + model_inputs, split_size=batch_size, full_batch_size=batch_beam_size ) outputs_per_sub_batch = [ self( @@ -192,17 +328,12 @@ def new_beam_search( output_hidden_states=output_hidden_states, ) tm_infer_list.append(time.perf_counter() - tic_infer) - if first_iteration: - input_ids = input_ids.repeat_interleave(num_beams, dim=0) - model_kwargs = self._update_inputs_for_beam_search(model_kwargs, num_beams) - logits, past_key_values = self._expand_outputs_for_generation( - num_beams, outputs.logits, outputs.past_key_values - ) - outputs.logits = logits - outputs.past_key_values = past_key_values + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need next_token_logits = outputs.logits[:, -1, :] - next_token_scores = torch.nn.functional.log_softmax( + next_token_scores = nn.functional.log_softmax( next_token_logits, dim=-1 ) # (batch_size * num_beams, vocab_size) @@ -266,7 +397,7 @@ def new_beam_search( model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder, ) - if model_kwargs.get("past_key_values", None) is not None and not first_iteration: + if model_kwargs.get("past_key_values", None) is not None: model_kwargs["past_key_values"] = self._temporary_reorder_cache( model_kwargs["past_key_values"], beam_idx ) @@ -280,8 +411,6 @@ def new_beam_search( if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)): this_peer_finished = True - first_iteration = False - sequence_outputs = beam_scorer.finalize( input_ids, beam_scores, @@ -298,16 +427,31 @@ def new_beam_search( if not output_scores: sequence_outputs["sequence_scores"] = None - return GenerateBeamDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - logits=raw_logits, - beam_indices=sequence_outputs["beam_indices"], - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - past_key_values=model_kwargs.get("past_key_values"), - ) + if self.config.is_encoder_decoder: + return GenerateBeamEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + logits=raw_logits, + beam_indices=sequence_outputs["beam_indices"], + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return GenerateBeamDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + logits=raw_logits, + beam_indices=sequence_outputs["beam_indices"], + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) else: return sequence_outputs["sequences"] @@ -339,15 +483,6 @@ def get_time_infer_list(self): global tm_infer_list return tm_infer_list - def new_forward(self, model, model_type=None): + def new_forward(self, model): """Define a new beam search function.""" - min_version = version.parse(hook_common.TRANS_MIN_VERSION) - trans_version = version.parse(transformers.__version__) - if trans_version < min_version: - log.warning(f'The function of getting latency of beam search will not 
be available with current transformers version:{trans_version}') - else: - min_second_version = version.parse(hook_common.TRANS_SENCOND_VERSION) - if trans_version >= min_second_version: - model._beam_search = new_beam_search.__get__(model, model.__class__) - else: - model.beam_search = hook_old_beam.old_beam_search.__get__(model, model.__class__) \ No newline at end of file + model._beam_search = new_beam_search.__get__(model, model.__class__) \ No newline at end of file diff --git a/llm_bench/python/utils/hook_beam_search_old.py b/llm_bench/python/utils/hook_beam_search_old.py deleted file mode 100644 index a1b1845f1e..0000000000 --- a/llm_bench/python/utils/hook_beam_search_old.py +++ /dev/null @@ -1,374 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# flake8: noqa -import time -import torch -import warnings -import transformers -import torch.distributed as dist -import logging as log -from torch import nn -from packaging import version -from typing import Optional, Tuple, Union, List -from transformers.generation.stopping_criteria import ( - StoppingCriteriaList, - validate_stopping_criteria, -) -from transformers.generation.logits_process import LogitsProcessorList -from transformers.generation.beam_search import BeamScorer -from transformers.utils import ModelOutput -import utils.hook_beam_search as hook_beam - - -class BeamSearchEncoderDecoderOutput(ModelOutput): - sequences: torch.LongTensor = None - sequences_scores: Optional[torch.FloatTensor] = None - scores: Optional[Tuple[torch.FloatTensor]] = None - beam_indices: Optional[torch.LongTensor] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -class BeamSearchDecoderOnlyOutput(ModelOutput): - sequences: torch.LongTensor = None - sequences_scores: Optional[torch.FloatTensor] = None - scores: Optional[Tuple[torch.FloatTensor]] = None - beam_indices: Optional[torch.LongTensor] = None - attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput] - - -# Transformers version: Release/v4.35.2 514de24abfd4416aeba6a6455ad5920f57f3567d -# Copied from https://github.com/huggingface/transformers/blob/514de24abfd4416aeba6a6455ad5920f57f3567d/src/transformers/generation/utils.py#L2894 -# Add the function of collecting latency -def old_beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - **model_kwargs, - ) -> Union[BeamSearchOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **beam search decoding** and - can be used for text-decoder, text-to-text, speech-to-text, and 
vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate() - instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... BeamSearchScorer, - ... ) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" 
- >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - - >>> # lets run beam search using 3 beams - >>> num_beams = 3 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> # instantiate beam scorer - >>> beam_scorer = BeamSearchScorer( - ... batch_size=1, - ... num_beams=num_beams, - ... device=model.device, - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ... ] - ... ) - - >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt bist du?'] - ```""" - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: - warnings.warn( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", - UserWarning, - ) - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - if len(stopping_criteria) == 0: - warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - batch_size = len(beam_scorer._beam_hyps) - num_beams = beam_scorer.num_beams - - batch_beam_size, cur_len = input_ids.shape - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." 
- ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - beam_indices = ( - tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None - ) - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens - # of the first beam are considered to avoid sampling the exact same tokens across all beams. - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False # used by synced_gpus only - while True: - tic = time.perf_counter() - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - tic_infer = time.perf_counter() - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - hook_beam.tm_infer_list.append(time.perf_counter() - tic_infer) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - - next_token_logits = outputs.logits[:, -1, :] - next_token_scores = nn.functional.log_softmax( - next_token_logits, dim=-1 - ) # (batch_size * num_beams, vocab_size) - - next_token_scores_processed = logits_processor(input_ids, next_token_scores) - next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as( - next_token_scores_processed - ) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_scores_processed,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # reshape for beam search - vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) - - # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
- n_eos_tokens = len(eos_token_id) if eos_token_id else 0 - next_token_scores, next_tokens = torch.topk( - next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True - ) - - next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") - next_tokens = next_tokens % vocab_size - - # stateless - beam_outputs = beam_scorer.process( - input_ids, - next_token_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - beam_indices=beam_indices, - ) - - beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) - - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder, - ) - if model_kwargs["past_key_values"] is not None: - model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx) - - if return_dict_in_generate and output_scores: - beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) - - # increase cur_len - cur_len = cur_len + 1 - hook_beam.tm_list.append(time.perf_counter() - tic) - if beam_scorer.is_done or stopping_criteria(input_ids, scores): - if not synced_gpus: - break - else: - this_peer_finished = True - - sequence_outputs = beam_scorer.finalize( - input_ids, - beam_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - beam_indices=beam_indices, - ) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - - if self.config.is_encoder_decoder: - return BeamSearchEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - beam_indices=sequence_outputs["beam_indices"], - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return BeamSearchDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - beam_indices=sequence_outputs["beam_indices"], - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - ) - else: - return sequence_outputs["sequences"] \ No newline at end of file diff --git a/llm_bench/python/utils/hook_common.py b/llm_bench/python/utils/hook_common.py index 5e93385d45..3ff78c9f68 100644 --- a/llm_bench/python/utils/hook_common.py +++ b/llm_bench/python/utils/hook_common.py @@ -2,149 +2,26 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # flake8: noqa -import torch -from typing import Union, List, Dict -from transformers.utils import ModelOutput - -TRANS_MIN_VERSION = '4.36.0' -TRANS_SENCOND_VERSION = '4.39.0' - - -# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L4783 -def _split(data, full_batch_size: int, split_size: int = None): - """ - Takes care of three cases: - 1. data is a tensor: e.g. last_hidden_state, pooler_output etc. split them on the batch_size dim - 2. data is a tuple: e.g. hidden_states, attentions etc. 
Keep the tuple as it is and split each tensor in it and - return a list of tuples - 3. data is a tuple of tuples, e.g. past_key_values. Keep the tuple as it is and split each tuple in it and - return a list of tuples of tuples - (see documentation of ModelOutput) - """ - if data is None: - return [None] * (full_batch_size // split_size) - if isinstance(data, torch.Tensor): - return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)] - elif isinstance(data, tuple): - # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example) - if isinstance(data[0], tuple): - return [ - tuple(tuple(tensor[i : i + split_size] for tensor in inner_tuple) for inner_tuple in data) - for i in range(0, full_batch_size, split_size) - ] - +import logging as log +import transformers +from packaging import version + +TRANS_MIN_VERSION = '4.40.0' + + +def get_bench_hook(num_beams, ov_model): + min_version = version.parse(TRANS_MIN_VERSION) + trans_version = version.parse(transformers.__version__) + search_type = 'beam search' if num_beams > 1 else 'greedy search' + if trans_version >= min_version: + import utils.hook_greedy_search + import utils.hook_beam_search + if num_beams > 1: + bench_hook = utils.hook_beam_search.BeamSearchHook() else: - return [ - tuple(sub_tensor[i : i + split_size] for sub_tensor in data) - for i in range(0, full_batch_size, split_size) - ] + bench_hook = utils.hook_greedy_search.GreedySearchHook() + bench_hook.new_forward(ov_model) else: - raise ValueError(f"Unexpected attribute type: {type(data)}") - - -# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L4814 -def _split_model_inputs( - model_input: Union[ModelOutput, Dict], split_size: int, full_batch_size: int -) -> List[Union[ModelOutput, Dict]]: - """ - Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split - size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from - previous forward pass. - """ - # Edge case: if model_input is None, return a list of Nones - # this happens with Whisper where encoder_outputs is None - if model_input is None: - return [model_input] * (full_batch_size // split_size) - # Infer the class from the object - model_output_cls = type(model_input) - if (full_batch_size % split_size) != 0: - raise ValueError("`full_batch_size` must be divisible by `split_size`") - - if split_size > full_batch_size: - raise ValueError("`split_size` must be smaller or equal to `full_batch_size`") - - # Helper function to split tensors or tuples of tensors - - # Find all the dataclass fields (e.g., last_hidden_state, pooler_output etc.) and split them - keys = ( - model_input.__dataclass_fields__.keys() if hasattr(model_input, "__dataclass_fields__") else model_input.keys() - ) - # We only keep keys that are in the model_input - keys = [k for k in keys if k in model_input] - # Here we can have four types of values: tensors, tuples of tensors and booleans, and encoder_outputs which is a - # ModelOutput object. 
- # bool should not be split but replicated for each split - bool_keys = [k for k in keys if isinstance(model_input[k], bool) or k == "cache_position"] - keys_to_ignore = ["cache_position", "encoder_outputs"] - non_bool_keys = [k for k in keys if not isinstance(model_input[k], bool) and k not in keys_to_ignore] - - # we split the tensors and tuples of tensors - data_split_list = [ - {k: _split(model_input[k], full_batch_size, split_size)[i] for k in non_bool_keys} - for i in range(full_batch_size // split_size) - ] - # bool values are the same and replicated for each split - bool_data = {k: model_input[k] for k in bool_keys} - # encoder_outputs is a ModelOutput object and should be split by its own - if "encoder_outputs" in model_input: - encoder_outputs_split = _split_model_inputs(model_input["encoder_outputs"], split_size, full_batch_size) - data_split_list = [ - {**data_split, "encoder_outputs": encoder_outputs_split[i]} for i, data_split in enumerate(data_split_list) - ] - - # Convert each dictionary in the list to an object of the inferred class - split_model_inputs: List[Union[ModelOutput, Dict]] = [ - model_output_cls(**data_split, **bool_data) for data_split in data_split_list - ] - - return split_model_inputs - - -# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L4871 -def stack_model_outputs(model_outputs: List[ModelOutput]) -> ModelOutput: - """ - Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the - specific ModelOutput subclass from the list provided. - """ - if not model_outputs: - raise ValueError("Input list is empty.") - - # Infer the class from the first object in the list - model_output_cls = type(model_outputs[0]) - - # Ensure all objects are of the same type - if not all(isinstance(obj, model_output_cls) for obj in model_outputs): - raise ValueError("All elements in the list should be of the same type.") - - # Helper function to concat tensors or tuples of tensors - def _concat(data): - """ - Reverse of `_split` function above. 
- """ - if any(data is None for data in data): - return None - if isinstance(data[0], torch.Tensor): - return torch.cat(data, dim=0) - elif isinstance(data[0], tuple): - # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example) - if isinstance(data[0][0], tuple): - return tuple( - tuple(torch.cat([attr[i][j] for attr in data], dim=0) for j in range(len(data[0][0]))) - for i in range(len(data[0])) - ) - else: - return tuple(torch.cat([attr[i] for attr in data], dim=0) for i in range(len(data[0]))) - elif isinstance(data[0], (int, float)): - # If the elements are integers or floats, return a tensor - return torch.tensor(data) - else: - raise ValueError(f"Unexpected attribute type: {type(data[0])}") - - # Use a dictionary comprehension to gather attributes from all objects and concatenate them - concatenated_data = { - k: _concat([getattr(model_output, k) for model_output in model_outputs]) - for k in model_output_cls.__dataclass_fields__.keys() - } - - # Return a new object of the inferred class with the concatenated attributes - return model_output_cls(**concatenated_data) \ No newline at end of file + log.warning(f'The minimum version of transformers to get 1st and 2nd tokens latency of {search_type} is: {min_version}') + bench_hook = None + return bench_hook \ No newline at end of file diff --git a/llm_bench/python/utils/hook_greedy_search.py b/llm_bench/python/utils/hook_greedy_search.py index 7fcbff2fd7..a3912726d7 100644 --- a/llm_bench/python/utils/hook_greedy_search.py +++ b/llm_bench/python/utils/hook_greedy_search.py @@ -5,20 +5,19 @@ import time import torch import warnings -import transformers -import torch.distributed as dist import logging as log -import utils.hook_common as hook_common -from packaging import version from typing import Optional, Tuple, Union, List from transformers.generation.stopping_criteria import ( + EosTokenCriteria, StoppingCriteriaList, validate_stopping_criteria, ) from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.streamers import BaseStreamer from transformers.utils import ModelOutput -import utils.hook_greedy_search_old as hook_old_greedy + + +logger = log.getLogger(__name__) class GenerateDecoderOnlyOutput(ModelOutput): @@ -47,8 +46,8 @@ class GenerateEncoderDecoderOutput(ModelOutput): tm_list = [] tm_infer_list = [] -# Transformers version: Release/v4.39.2 97c00cdfe132164dbd793447a088432fa359fd36 -# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L2244 +# Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99 +# Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2310 # Add the function of collecting latency def new_greedy_search( self, @@ -173,10 +172,27 @@ def new_greedy_search( ) stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if eos_token_id is not None: + logger.warning_once( + "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" + " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." 
+ " Otherwise make sure to set `model.generation_config.eos_token_id`", + FutureWarning, + ) + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + else: + # TODO remove when the method is totally private + # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever + eos_token_id = [ + criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id") + ] + eos_token_id = eos_token_id[0] if eos_token_id else None + if eos_token_id is None and self.generation_config.eos_token_id is not None: + eos_token_id = self.generation_config.eos_token_id + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None output_scores = output_scores if output_scores is not None else self.generation_config.output_scores output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions @@ -274,12 +290,6 @@ def new_greedy_search( is_encoder_decoder=self.config.is_encoder_decoder, ) - # if eos_token was found in one sentence, set sentence to finished - if eos_token_id_tensor is not None: - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) - unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) this_peer_finished = unfinished_sequences.max() == 0 tm_list.append(time.perf_counter() - tic) @@ -340,15 +350,7 @@ def get_time_infer_list(self): global tm_infer_list return tm_infer_list - def new_forward(self, model, model_type=None): + def new_forward(self, model): """Define a new greedy search function.""" - min_version = version.parse(hook_common.TRANS_MIN_VERSION) - trans_version = version.parse(transformers.__version__) - if trans_version < min_version: - log.warning(f'The function of getting latency of greedy search will not be available with current transformers version:{trans_version}') - else: - min_second_version = version.parse(hook_common.TRANS_SENCOND_VERSION) - if trans_version >= min_second_version: - model._greedy_search = new_greedy_search.__get__(model, model.__class__) - else: - model.greedy_search = hook_old_greedy.old_greedy_search.__get__(model, model.__class__) + model._greedy_search = new_greedy_search.__get__(model, model.__class__) + diff --git a/llm_bench/python/utils/hook_greedy_search_old.py b/llm_bench/python/utils/hook_greedy_search_old.py deleted file mode 100644 index 595aa596da..0000000000 --- a/llm_bench/python/utils/hook_greedy_search_old.py +++ /dev/null @@ -1,302 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# flake8: noqa -import time -import torch -import warnings -import torch.distributed as dist -from typing import Optional, Tuple, Union, List -from transformers.generation.stopping_criteria import ( - StoppingCriteriaList, - validate_stopping_criteria, -) -from transformers.generation.logits_process import LogitsProcessorList -from transformers.generation.streamers import BaseStreamer -from transformers.utils import ModelOutput -import utils.hook_greedy_search as hook_greedy - - -class GreedySearchDecoderOnlyOutput(ModelOutput): - sequences: torch.LongTensor = None - scores: Optional[Tuple[torch.FloatTensor]] = None - attentions: 
Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -class GreedySearchEncoderDecoderOutput(ModelOutput): - sequences: torch.LongTensor = None - scores: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput] - -# Transformers version: Release/v4.35.2 514de24abfd4416aeba6a6455ad5920f57f3567d -# Copied from https://github.com/huggingface/transformers/blob/514de24abfd4416aeba6a6455ad5920f57f3567d/src/transformers/generation/utils.py#L2353 -# Add the function of collecting latency -def old_greedy_search( - self, - input_ids: torch.LongTensor, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - streamer: Optional["BaseStreamer"] = None, - **model_kwargs, - ) -> Union[GreedySearchOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be - used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate() - instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. 
- output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. - model_kwargs: - Additional model specific keyword arguments will be forwarded to the `forward` function of the model. - If model is an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`~generation.GreedySearchDecoderOnlyOutput`], [`~generation.GreedySearchEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GreedySearchEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForCausalLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... StoppingCriteriaList, - ... MaxLengthCriteria, - ... ) - - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = AutoModelForCausalLM.from_pretrained("gpt2") - - >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token - >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id - - >>> input_prompt = "It might be possible to" - >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id), - ... ] - ... ) - >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) - - >>> outputs = model.greedy_search( - ... input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria - ... 
) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ["It might be possible to get a better understanding of the nature of the problem, but it's not"] - ```""" - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: - warnings.warn( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", - UserWarning, - ) - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # keep track of which sequences are already finished - unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) - - this_peer_finished = False # used by synced_gpus only - while True: - tic = time.perf_counter() - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? 
the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - # prepare model inputs - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # forward pass to get next token - tic_infer = time.perf_counter() - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - hook_greedy.tm_infer_list.append(time.perf_counter() - tic_infer) - - if synced_gpus and this_peer_finished: - continue # don't waste resources running the code we don't need - - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_tokens_scores = logits_processor(input_ids, next_token_logits) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_tokens_scores,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # argmax - next_tokens = torch.argmax(next_tokens_scores, dim=-1) - - # finished sentences should have their next token be a padding token - if eos_token_id is not None: - if pad_token_id is None: - raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) - - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - if streamer is not None: - streamer.put(next_tokens.cpu()) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - - # if eos_token was found in one sentence, set sentence to finished - if eos_token_id_tensor is not None: - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) - - # stop when each sentence is finished - if unfinished_sequences.max() == 0: - this_peer_finished = True - - # stop if we exceed the maximum length - if stopping_criteria(input_ids, scores): - this_peer_finished = True - hook_greedy.tm_list.append(time.perf_counter() - tic) - if this_peer_finished and not synced_gpus: - break - - if streamer is not None: - streamer.end() - - if return_dict_in_generate: - if self.config.is_encoder_decoder: - return GreedySearchEncoderDecoderOutput( - sequences=input_ids, - scores=scores, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return GreedySearchDecoderOnlyOutput( - sequences=input_ids, - scores=scores, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - ) - else: - return input_ids \ No newline at end of file diff --git a/llm_bench/python/utils/ov_utils.py b/llm_bench/python/utils/ov_utils.py index a2416ccb92..3d77941ca9 100644 --- a/llm_bench/python/utils/ov_utils.py +++ b/llm_bench/python/utils/ov_utils.py @@ -9,9 +9,7 @@ import torch import time import types -import utils.hook_greedy_search -import utils.hook_beam_search - 
+import utils.hook_common as hook_common
 from utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES
 import openvino.runtime.opset13 as opset
@@ -159,11 +157,7 @@ def create_text_gen_model(model_path, device, **kwargs):
         if not isinstance(ov_model, OV_MODEL_CLASSES_MAPPING['t5']):
             patch_inter_processing_and_compile(ov_model, **kwargs)
     end = time.perf_counter()
-    if kwargs['num_beams'] > 1:
-        bench_hook = utils.hook_beam_search.BeamSearchHook()
-    else:
-        bench_hook = utils.hook_greedy_search.GreedySearchHook()
-    bench_hook.new_forward(ov_model, model_type)
+    bench_hook = hook_common.get_bench_hook(kwargs['num_beams'], ov_model)
     from_pretrained_time = end - start
     log.info(f'From pretrained time: {from_pretrained_time:.2f}s')
     # load token
diff --git a/llm_bench/python/utils/pt_utils.py b/llm_bench/python/utils/pt_utils.py
index d703f4bb1a..ccf401330c 100644
--- a/llm_bench/python/utils/pt_utils.py
+++ b/llm_bench/python/utils/pt_utils.py
@@ -7,11 +7,7 @@
 import os
 import time
 import logging as log
-import openvino.torch # noqa: F401
-import utils.hook_greedy_search
-import utils.hook_beam_search
-
-MAX_CONNECT_TIME = 50
+import utils.hook_common as hook_common
 
 
 def set_bf16(model, device, **kwargs):
@@ -95,11 +91,7 @@ def create_text_gen_model(model_path, device, **kwargs):
     else:
         raise RuntimeError('==Failure ==: no device to load')
 
-    if kwargs['num_beams'] > 1:
-        bench_hook = utils.hook_beam_search.BeamSearchHook()
-    else:
-        bench_hook = utils.hook_greedy_search.GreedySearchHook()
-    bench_hook.new_forward(model, model_type)
+    bench_hook = hook_common.get_bench_hook(kwargs['num_beams'], model)
 
     if kwargs['torch_compile_backend']:
         backend = kwargs['torch_compile_backend']
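
For reference, a minimal usage sketch of the refactored hook wiring introduced by this patch (not part of the diff itself). It assumes it runs from llm_bench/python so that `utils.hook_common` is importable, and the `generate_with_latency` helper name, its arguments, and the tokenizer/generate calls are illustrative only. `get_bench_hook()` picks `BeamSearchHook` or `GreedySearchHook` based on `num_beams`, patches the model's search method on transformers >= 4.40.0, and returns `None` (with a warning) on older versions, which is why benchmark.py now guards the hook before reading or clearing its latency lists; this sketch mirrors that guard.

import utils.hook_common as hook_common


def generate_with_latency(model, tokenizer, prompt, num_beams=1, max_new_tokens=32):
    # Hypothetical helper, not part of the patch.
    # Returns BeamSearchHook/GreedySearchHook on transformers >= 4.40.0, otherwise None.
    bench_hook = hook_common.get_bench_hook(num_beams, model)

    input_ids = tokenizer(prompt, return_tensors='pt').input_ids
    output = model.generate(input_ids, num_beams=num_beams, max_new_tokens=max_new_tokens)

    # Same guard as benchmark.py: latency lists are only available when a hook was installed.
    tm_list, tm_infer_list = [], []
    if bench_hook is not None:
        tm_list = bench_hook.get_time_list()               # wall time per generated token
        tm_infer_list = bench_hook.get_time_infer_list()   # pure inference time per token
        bench_hook.clear_time_list()
        bench_hook.clear_time_infer_list()
    return output, tm_list, tm_infer_list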