diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py
index 8a9d754f65..3f99081031 100644
--- a/llm_bench/python/benchmark.py
+++ b/llm_bench/python/benchmark.py
@@ -132,10 +132,13 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
if num == 0:
warmup_md5[prompt_index] = result_md5_list
per_token_time = generation_time * 1000 / (num_tokens / args['batch_size'])
- tm_list = bench_hook.get_time_list()
- log.debug('latency of all tokens:')
- [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
- tm_infer_list = bench_hook.get_time_infer_list()
+ tm_list = []
+ tm_infer_list = []
+ if bench_hook is not None:
+ tm_list = bench_hook.get_time_list()
+ log.debug('latency of all tokens:')
+ [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
+ tm_infer_list = bench_hook.get_time_infer_list()
iter_data = gen_iterate_data(
num,
input_token_size * args['batch_size'],
@@ -168,8 +171,9 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0])
else:
utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0])
- bench_hook.clear_time_list()
- bench_hook.clear_time_infer_list()
+ if bench_hook is not None:
+ bench_hook.clear_time_list()
+ bench_hook.clear_time_infer_list()
def run_text_generation_benchmark(model_path, framework, device, args, num_iters):
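The benchmark.py hunks above make the latency hook optional: when `get_bench_hook` (added below in hook_common.py) returns `None` because transformers is too old, `tm_list` and `tm_infer_list` simply stay empty instead of failing on a missing hook. A minimal, self-contained sketch of that pattern; `collect_token_latencies` and `first_token_latency_ms` are illustrative names, not part of the patch:

```python
from typing import List, Tuple


def collect_token_latencies(bench_hook) -> Tuple[List[float], List[float]]:
    """Return (per-token wall times, per-token inference times); empty lists if no hook."""
    if bench_hook is None:
        # transformers too old: get_bench_hook() returned None, nothing was measured
        return [], []
    return bench_hook.get_time_list(), bench_hook.get_time_infer_list()


tm_list, tm_infer_list = collect_token_latencies(bench_hook=None)
first_token_latency_ms = tm_list[0] * 1000 if tm_list else None  # None when unmeasured
print(first_token_latency_ms)  # -> None in this no-hook case
```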
diff --git a/llm_bench/python/utils/hook_beam_search.py b/llm_bench/python/utils/hook_beam_search.py
index 97957559a4..99b0a9e5c3 100644
--- a/llm_bench/python/utils/hook_beam_search.py
+++ b/llm_bench/python/utils/hook_beam_search.py
@@ -5,29 +5,21 @@
import time
import torch
import warnings
-import transformers
import logging as log
-import utils.hook_common as hook_common
from torch import nn
-from packaging import version
from typing import Optional, Tuple, Union, List
from transformers.generation.stopping_criteria import (
+ EosTokenCriteria,
StoppingCriteriaList,
validate_stopping_criteria,
)
from transformers.generation.logits_process import LogitsProcessorList
from transformers.generation.beam_search import BeamScorer
-from transformers.generation.stopping_criteria import (
- EosTokenCriteria,
- StoppingCriteriaList,
- validate_stopping_criteria,
-)
from transformers.generation.utils import (
_split_model_inputs,
stack_model_outputs,
)
from transformers.utils import ModelOutput
-import utils.hook_beam_search_old as hook_old_beam
logger = log.getLogger(__name__)
@@ -64,8 +56,8 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput):
tm_infer_list = []
-# Transformers version: Release/v4.39.2 97c00cdfe132164dbd793447a088432fa359fd36
-# Copied from https://github.com/eaidova/optimum-intel/blob/86c2baf253c1f8d063bf71e143837e5d8e629909/optimum/intel/openvino/modeling_decoder.py#L767
+# Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99
+# Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2911
# Add the function of collecting latency
def new_beam_search(
self,
@@ -85,6 +77,120 @@ def new_beam_search(
sequential: Optional[bool] = None,
**model_kwargs,
) -> Union[GenerateBeamOutput, torch.LongTensor]:
+ r"""
+ Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
+ can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
+
+
+
+ In most cases, you do not need to call [`~generation.GenerationMixin._beam_search`] directly. Use generate()
+ instead. For an overview of generation strategies and code examples, check the [following
+ guide](../generation_strategies).
+
+
+
+ Parameters:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ The sequence used as a prompt for the generation.
+ beam_scorer (`BeamScorer`):
+ An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
+ sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+ used to modify the prediction scores of the language modeling head applied at each generation step.
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
+ used to tell if the generation loop should stop.
+ max_length (`int`, *optional*, defaults to 20):
+ **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
+ tokens. The maximum length of the sequence to be generated.
+ pad_token_id (`int`, *optional*):
+ The id of the *padding* token.
+ eos_token_id (`Union[int, List[int]]`, *optional*):
+ The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more details.
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more details.
+ output_logits (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for
+ more details.
+ output_scores (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+ return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ synced_gpus (`bool`, *optional*, defaults to `False`):
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+ sequential (`bool`, defaults to `False`):
+ By default, beam search has `batch_size * num_beams` as effective batch size (see `beam_search()` for
+ more details). This flag will avoid parallelizing the beam search and will instead run beam search
+ sequentially.
+ model_kwargs:
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
+ an encoder-decoder model the kwargs should include `encoder_outputs`.
+
+ Return:
+ [`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
+ `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
+ [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
+ `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
+ `model.config.is_encoder_decoder=True`.
+
+
+ Examples:
+
+ ```python
+ >>> from transformers import (
+ ... AutoTokenizer,
+ ... AutoModelForSeq2SeqLM,
+ ... LogitsProcessorList,
+ ... MinLengthLogitsProcessor,
+ ... BeamSearchScorer,
+ ... )
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
+ >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")
+
+ >>> encoder_input_str = "translate English to German: How old are you?"
+ >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
+
+
+ >>> # lets run beam search using 3 beams
+ >>> num_beams = 3
+ >>> # define decoder start token ids
+ >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
+ >>> input_ids = input_ids * model.config.decoder_start_token_id
+
+ >>> # add encoder_outputs to model keyword arguments
+ >>> model_kwargs = {
+ ... "encoder_outputs": model.get_encoder()(
+ ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
+ ... )
+ ... }
+
+ >>> # instantiate beam scorer
+ >>> beam_scorer = BeamSearchScorer(
+ ... batch_size=1,
+ ... num_beams=num_beams,
+ ... device=model.device,
+ ... )
+
+ >>> # instantiate logits processors
+ >>> logits_processor = LogitsProcessorList(
+ ... [
+ ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
+ ... ]
+ ... )
+
+ >>> outputs = model._beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
+
+ >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+ ['Wie alt bist du?']
+ ```"""
+ # init values
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
sequential = sequential if sequential is not None else self.generation_config.low_memory
@@ -107,6 +213,8 @@ def new_beam_search(
)
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
else:
+ # TODO remove when the method is totally private and beam scorer refactored
+ # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever
eos_token_id = [
criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id")
]
@@ -139,16 +247,28 @@ def new_beam_search(
cur_len = model_kwargs["inputs_embeds"].shape[1]
model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device)
+ if num_beams * batch_size != batch_beam_size:
+ raise ValueError(
+ f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
+ )
+
# init attention / hidden states / scores tuples
scores = () if (return_dict_in_generate and output_scores) else None
raw_logits = () if (return_dict_in_generate and output_logits) else None
beam_indices = (
- tuple(() for _ in range(num_beams * batch_size)) if (return_dict_in_generate and output_scores) else None
+ tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None
)
decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
cross_attentions = () if (return_dict_in_generate and output_attentions) else None
decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+ if return_dict_in_generate and self.config.is_encoder_decoder:
+ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+ encoder_hidden_states = (
+ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+ )
+
# initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens
# of the first beam are considered to avoid sampling the exact same tokens across all beams.
beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
@@ -158,7 +278,6 @@ def new_beam_search(
this_peer_finished = False
decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder
- first_iteration = True
while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
tic = time.perf_counter()
@@ -167,10 +286,27 @@ def new_beam_search(
# if sequential is True, split the input to batches of batch_size and run sequentially
tic_infer = time.perf_counter()
if sequential:
+ if any(
+ model_name in self.__class__.__name__.lower()
+ for model_name in [
+ "fsmt",
+ "reformer",
+ "bloom",
+ "ctrl",
+ "gpt_bigcode",
+ "transo_xl",
+ "xlnet",
+ "cpm",
+ "jamba",
+ ]
+ ):
+ raise RuntimeError(
+ f"Currently generation for {self.__class__.__name__} is not supported "
+ f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature."
+ )
+
inputs_per_sub_batches = _split_model_inputs(
- model_inputs,
- split_size=batch_size,
- full_batch_size=batch_beam_size if not first_iteration else batch_size,
+ model_inputs, split_size=batch_size, full_batch_size=batch_beam_size
)
outputs_per_sub_batch = [
self(
@@ -192,17 +328,12 @@ def new_beam_search(
output_hidden_states=output_hidden_states,
)
tm_infer_list.append(time.perf_counter() - tic_infer)
- if first_iteration:
- input_ids = input_ids.repeat_interleave(num_beams, dim=0)
- model_kwargs = self._update_inputs_for_beam_search(model_kwargs, num_beams)
- logits, past_key_values = self._expand_outputs_for_generation(
- num_beams, outputs.logits, outputs.past_key_values
- )
- outputs.logits = logits
- outputs.past_key_values = past_key_values
+ if synced_gpus and this_peer_finished:
+ cur_len = cur_len + 1
+ continue # don't waste resources running the code we don't need
next_token_logits = outputs.logits[:, -1, :]
- next_token_scores = torch.nn.functional.log_softmax(
+ next_token_scores = nn.functional.log_softmax(
next_token_logits, dim=-1
) # (batch_size * num_beams, vocab_size)
@@ -266,7 +397,7 @@ def new_beam_search(
model_kwargs,
is_encoder_decoder=self.config.is_encoder_decoder,
)
- if model_kwargs.get("past_key_values", None) is not None and not first_iteration:
+ if model_kwargs.get("past_key_values", None) is not None:
model_kwargs["past_key_values"] = self._temporary_reorder_cache(
model_kwargs["past_key_values"], beam_idx
)
@@ -280,8 +411,6 @@ def new_beam_search(
if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)):
this_peer_finished = True
- first_iteration = False
-
sequence_outputs = beam_scorer.finalize(
input_ids,
beam_scores,
@@ -298,16 +427,31 @@ def new_beam_search(
if not output_scores:
sequence_outputs["sequence_scores"] = None
- return GenerateBeamDecoderOnlyOutput(
- sequences=sequence_outputs["sequences"],
- sequences_scores=sequence_outputs["sequence_scores"],
- scores=scores,
- logits=raw_logits,
- beam_indices=sequence_outputs["beam_indices"],
- attentions=decoder_attentions,
- hidden_states=decoder_hidden_states,
- past_key_values=model_kwargs.get("past_key_values"),
- )
+ if self.config.is_encoder_decoder:
+ return GenerateBeamEncoderDecoderOutput(
+ sequences=sequence_outputs["sequences"],
+ sequences_scores=sequence_outputs["sequence_scores"],
+ scores=scores,
+ logits=raw_logits,
+ beam_indices=sequence_outputs["beam_indices"],
+ encoder_attentions=encoder_attentions,
+ encoder_hidden_states=encoder_hidden_states,
+ decoder_attentions=decoder_attentions,
+ cross_attentions=cross_attentions,
+ decoder_hidden_states=decoder_hidden_states,
+ past_key_values=model_kwargs.get("past_key_values"),
+ )
+ else:
+ return GenerateBeamDecoderOnlyOutput(
+ sequences=sequence_outputs["sequences"],
+ sequences_scores=sequence_outputs["sequence_scores"],
+ scores=scores,
+ logits=raw_logits,
+ beam_indices=sequence_outputs["beam_indices"],
+ attentions=decoder_attentions,
+ hidden_states=decoder_hidden_states,
+ past_key_values=model_kwargs.get("past_key_values"),
+ )
else:
return sequence_outputs["sequences"]
@@ -339,15 +483,6 @@ def get_time_infer_list(self):
global tm_infer_list
return tm_infer_list
- def new_forward(self, model, model_type=None):
+ def new_forward(self, model):
"""Define a new beam search function."""
- min_version = version.parse(hook_common.TRANS_MIN_VERSION)
- trans_version = version.parse(transformers.__version__)
- if trans_version < min_version:
- log.warning(f'The function of getting latency of beam search will not be available with current transformers version:{trans_version}')
- else:
- min_second_version = version.parse(hook_common.TRANS_SENCOND_VERSION)
- if trans_version >= min_second_version:
- model._beam_search = new_beam_search.__get__(model, model.__class__)
- else:
- model.beam_search = hook_old_beam.old_beam_search.__get__(model, model.__class__)
\ No newline at end of file
+ model._beam_search = new_beam_search.__get__(model, model.__class__)
\ No newline at end of file
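With the version fallback removed, `new_forward` reduces to a single rebinding of `_beam_search` on the model instance. The rebinding uses the descriptor protocol: calling `__get__` on a plain function yields a method bound to that specific object, so only the patched instance is affected. A self-contained sketch of the technique with toy names (not from the patch):

```python
class Model:
    def __init__(self, name: str):
        self.name = name

    def _beam_search(self) -> str:
        return f"original beam search on {self.name}"


def new_beam_search(self) -> str:
    # a free function; `self` is filled in once it is bound to an instance
    return f"patched beam search on {self.name}"


m = Model("ov_model")
# function.__get__(instance, owner) returns a bound method, mirroring
# `model._beam_search = new_beam_search.__get__(model, model.__class__)` above
m._beam_search = new_beam_search.__get__(m, Model)
print(m._beam_search())               # -> patched beam search on ov_model
print(Model("other")._beam_search())  # -> original beam search on other
```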
diff --git a/llm_bench/python/utils/hook_beam_search_old.py b/llm_bench/python/utils/hook_beam_search_old.py
deleted file mode 100644
index a1b1845f1e..0000000000
--- a/llm_bench/python/utils/hook_beam_search_old.py
+++ /dev/null
@@ -1,374 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (C) 2023-2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-# flake8: noqa
-import time
-import torch
-import warnings
-import transformers
-import torch.distributed as dist
-import logging as log
-from torch import nn
-from packaging import version
-from typing import Optional, Tuple, Union, List
-from transformers.generation.stopping_criteria import (
- StoppingCriteriaList,
- validate_stopping_criteria,
-)
-from transformers.generation.logits_process import LogitsProcessorList
-from transformers.generation.beam_search import BeamScorer
-from transformers.utils import ModelOutput
-import utils.hook_beam_search as hook_beam
-
-
-class BeamSearchEncoderDecoderOutput(ModelOutput):
- sequences: torch.LongTensor = None
- sequences_scores: Optional[torch.FloatTensor] = None
- scores: Optional[Tuple[torch.FloatTensor]] = None
- beam_indices: Optional[torch.LongTensor] = None
- encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
- encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
- decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
- cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
- decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-
-
-class BeamSearchDecoderOnlyOutput(ModelOutput):
- sequences: torch.LongTensor = None
- sequences_scores: Optional[torch.FloatTensor] = None
- scores: Optional[Tuple[torch.FloatTensor]] = None
- beam_indices: Optional[torch.LongTensor] = None
- attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
- hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-
-
-BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput]
-
-
-# Transformers version: Release/v4.35.2 514de24abfd4416aeba6a6455ad5920f57f3567d
-# Copied from https://github.com/huggingface/transformers/blob/514de24abfd4416aeba6a6455ad5920f57f3567d/src/transformers/generation/utils.py#L2894
-# Add the function of collecting latency
-def old_beam_search(
- self,
- input_ids: torch.LongTensor,
- beam_scorer: BeamScorer,
- logits_processor: Optional[LogitsProcessorList] = None,
- stopping_criteria: Optional[StoppingCriteriaList] = None,
- max_length: Optional[int] = None,
- pad_token_id: Optional[int] = None,
- eos_token_id: Optional[Union[int, List[int]]] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- output_scores: Optional[bool] = None,
- return_dict_in_generate: Optional[bool] = None,
- synced_gpus: bool = False,
- **model_kwargs,
- ) -> Union[BeamSearchOutput, torch.LongTensor]:
- r"""
- Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
- can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
-
-
-
- In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate()
- instead. For an overview of generation strategies and code examples, check the [following
- guide](../generation_strategies).
-
-
-
- Parameters:
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
- The sequence used as a prompt for the generation.
- beam_scorer (`BeamScorer`):
- An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
- sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
- logits_processor (`LogitsProcessorList`, *optional*):
- An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
- used to modify the prediction scores of the language modeling head applied at each generation step.
- stopping_criteria (`StoppingCriteriaList`, *optional*):
- An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
- used to tell if the generation loop should stop.
- max_length (`int`, *optional*, defaults to 20):
- **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
- tokens. The maximum length of the sequence to be generated.
- pad_token_id (`int`, *optional*):
- The id of the *padding* token.
- eos_token_id (`Union[int, List[int]]`, *optional*):
- The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
- output_attentions (`bool`, *optional*, defaults to `False`):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
- returned tensors for more details.
- output_hidden_states (`bool`, *optional*, defaults to `False`):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
- for more details.
- output_scores (`bool`, *optional*, defaults to `False`):
- Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
- return_dict_in_generate (`bool`, *optional*, defaults to `False`):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
- synced_gpus (`bool`, *optional*, defaults to `False`):
- Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
- model_kwargs:
- Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
- an encoder-decoder model the kwargs should include `encoder_outputs`.
-
- Return:
- [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or
- `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
- [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
- `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if
- `model.config.is_encoder_decoder=True`.
-
-
- Examples:
-
- ```python
- >>> from transformers import (
- ... AutoTokenizer,
- ... AutoModelForSeq2SeqLM,
- ... LogitsProcessorList,
- ... MinLengthLogitsProcessor,
- ... BeamSearchScorer,
- ... )
- >>> import torch
-
- >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
- >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
-
- >>> encoder_input_str = "translate English to German: How old are you?"
- >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
-
-
- >>> # lets run beam search using 3 beams
- >>> num_beams = 3
- >>> # define decoder start token ids
- >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
- >>> input_ids = input_ids * model.config.decoder_start_token_id
-
- >>> # add encoder_outputs to model keyword arguments
- >>> model_kwargs = {
- ... "encoder_outputs": model.get_encoder()(
- ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
- ... )
- ... }
-
- >>> # instantiate beam scorer
- >>> beam_scorer = BeamSearchScorer(
- ... batch_size=1,
- ... num_beams=num_beams,
- ... device=model.device,
- ... )
-
- >>> # instantiate logits processors
- >>> logits_processor = LogitsProcessorList(
- ... [
- ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
- ... ]
- ... )
-
- >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
-
- >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
- ['Wie alt bist du?']
- ```"""
- # init values
- logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
- stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
- if max_length is not None:
- warnings.warn(
- "`max_length` is deprecated in this function, use"
- " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
- UserWarning,
- )
- stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
- if len(stopping_criteria) == 0:
- warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning)
- pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
- eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
- if isinstance(eos_token_id, int):
- eos_token_id = [eos_token_id]
- output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
- output_attentions = (
- output_attentions if output_attentions is not None else self.generation_config.output_attentions
- )
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
- )
- return_dict_in_generate = (
- return_dict_in_generate
- if return_dict_in_generate is not None
- else self.generation_config.return_dict_in_generate
- )
-
- batch_size = len(beam_scorer._beam_hyps)
- num_beams = beam_scorer.num_beams
-
- batch_beam_size, cur_len = input_ids.shape
-
- if num_beams * batch_size != batch_beam_size:
- raise ValueError(
- f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
- )
-
- # init attention / hidden states / scores tuples
- scores = () if (return_dict_in_generate and output_scores) else None
- beam_indices = (
- tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None
- )
- decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
- cross_attentions = () if (return_dict_in_generate and output_attentions) else None
- decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
-
- # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
- if return_dict_in_generate and self.config.is_encoder_decoder:
- encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
- encoder_hidden_states = (
- model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
- )
-
- # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens
- # of the first beam are considered to avoid sampling the exact same tokens across all beams.
- beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
- beam_scores[:, 1:] = -1e9
- beam_scores = beam_scores.view((batch_size * num_beams,))
-
- this_peer_finished = False # used by synced_gpus only
- while True:
- tic = time.perf_counter()
- if synced_gpus:
- # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
- # The following logic allows an early break if all peers finished generating their sequence
- this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
- # send 0.0 if we finished, 1.0 otherwise
- dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
- # did all peers finish? the reduced sum will be 0.0 then
- if this_peer_finished_flag.item() == 0.0:
- break
-
- model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
- tic_infer = time.perf_counter()
- outputs = self(
- **model_inputs,
- return_dict=True,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- )
- hook_beam.tm_infer_list.append(time.perf_counter() - tic_infer)
-
- if synced_gpus and this_peer_finished:
- cur_len = cur_len + 1
- continue # don't waste resources running the code we don't need
-
- next_token_logits = outputs.logits[:, -1, :]
- next_token_scores = nn.functional.log_softmax(
- next_token_logits, dim=-1
- ) # (batch_size * num_beams, vocab_size)
-
- next_token_scores_processed = logits_processor(input_ids, next_token_scores)
- next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
- next_token_scores_processed
- )
-
- # Store scores, attentions and hidden_states when required
- if return_dict_in_generate:
- if output_scores:
- scores += (next_token_scores_processed,)
- if output_attentions:
- decoder_attentions += (
- (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
- )
- if self.config.is_encoder_decoder:
- cross_attentions += (outputs.cross_attentions,)
-
- if output_hidden_states:
- decoder_hidden_states += (
- (outputs.decoder_hidden_states,)
- if self.config.is_encoder_decoder
- else (outputs.hidden_states,)
- )
-
- # reshape for beam search
- vocab_size = next_token_scores.shape[-1]
- next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
-
- # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam.
- n_eos_tokens = len(eos_token_id) if eos_token_id else 0
- next_token_scores, next_tokens = torch.topk(
- next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True
- )
-
- next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
- next_tokens = next_tokens % vocab_size
-
- # stateless
- beam_outputs = beam_scorer.process(
- input_ids,
- next_token_scores,
- next_tokens,
- next_indices,
- pad_token_id=pad_token_id,
- eos_token_id=eos_token_id,
- beam_indices=beam_indices,
- )
-
- beam_scores = beam_outputs["next_beam_scores"]
- beam_next_tokens = beam_outputs["next_beam_tokens"]
- beam_idx = beam_outputs["next_beam_indices"]
-
- input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
-
- model_kwargs = self._update_model_kwargs_for_generation(
- outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder,
- )
- if model_kwargs["past_key_values"] is not None:
- model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx)
-
- if return_dict_in_generate and output_scores:
- beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices))))
-
- # increase cur_len
- cur_len = cur_len + 1
- hook_beam.tm_list.append(time.perf_counter() - tic)
- if beam_scorer.is_done or stopping_criteria(input_ids, scores):
- if not synced_gpus:
- break
- else:
- this_peer_finished = True
-
- sequence_outputs = beam_scorer.finalize(
- input_ids,
- beam_scores,
- next_tokens,
- next_indices,
- pad_token_id=pad_token_id,
- eos_token_id=eos_token_id,
- max_length=stopping_criteria.max_length,
- beam_indices=beam_indices,
- )
-
- if return_dict_in_generate:
- if not output_scores:
- sequence_outputs["sequence_scores"] = None
-
- if self.config.is_encoder_decoder:
- return BeamSearchEncoderDecoderOutput(
- sequences=sequence_outputs["sequences"],
- sequences_scores=sequence_outputs["sequence_scores"],
- scores=scores,
- beam_indices=sequence_outputs["beam_indices"],
- encoder_attentions=encoder_attentions,
- encoder_hidden_states=encoder_hidden_states,
- decoder_attentions=decoder_attentions,
- cross_attentions=cross_attentions,
- decoder_hidden_states=decoder_hidden_states,
- )
- else:
- return BeamSearchDecoderOnlyOutput(
- sequences=sequence_outputs["sequences"],
- sequences_scores=sequence_outputs["sequence_scores"],
- scores=scores,
- beam_indices=sequence_outputs["beam_indices"],
- attentions=decoder_attentions,
- hidden_states=decoder_hidden_states,
- )
- else:
- return sequence_outputs["sequences"]
\ No newline at end of file
diff --git a/llm_bench/python/utils/hook_common.py b/llm_bench/python/utils/hook_common.py
index 5e93385d45..3ff78c9f68 100644
--- a/llm_bench/python/utils/hook_common.py
+++ b/llm_bench/python/utils/hook_common.py
@@ -2,149 +2,26 @@
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# flake8: noqa
-import torch
-from typing import Union, List, Dict
-from transformers.utils import ModelOutput
-
-TRANS_MIN_VERSION = '4.36.0'
-TRANS_SENCOND_VERSION = '4.39.0'
-
-
-# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L4783
-def _split(data, full_batch_size: int, split_size: int = None):
- """
- Takes care of three cases:
- 1. data is a tensor: e.g. last_hidden_state, pooler_output etc. split them on the batch_size dim
- 2. data is a tuple: e.g. hidden_states, attentions etc. Keep the tuple as it is and split each tensor in it and
- return a list of tuples
- 3. data is a tuple of tuples, e.g. past_key_values. Keep the tuple as it is and split each tuple in it and
- return a list of tuples of tuples
- (see documentation of ModelOutput)
- """
- if data is None:
- return [None] * (full_batch_size // split_size)
- if isinstance(data, torch.Tensor):
- return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)]
- elif isinstance(data, tuple):
- # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
- if isinstance(data[0], tuple):
- return [
- tuple(tuple(tensor[i : i + split_size] for tensor in inner_tuple) for inner_tuple in data)
- for i in range(0, full_batch_size, split_size)
- ]
-
+import logging as log
+import transformers
+from packaging import version
+
+TRANS_MIN_VERSION = '4.40.0'
+
+
+def get_bench_hook(num_beams, ov_model):
+ min_version = version.parse(TRANS_MIN_VERSION)
+ trans_version = version.parse(transformers.__version__)
+ search_type = 'beam search' if num_beams > 1 else 'greedy search'
+ if trans_version >= min_version:
+ import utils.hook_greedy_search
+ import utils.hook_beam_search
+ if num_beams > 1:
+ bench_hook = utils.hook_beam_search.BeamSearchHook()
else:
- return [
- tuple(sub_tensor[i : i + split_size] for sub_tensor in data)
- for i in range(0, full_batch_size, split_size)
- ]
+ bench_hook = utils.hook_greedy_search.GreedySearchHook()
+ bench_hook.new_forward(ov_model)
else:
- raise ValueError(f"Unexpected attribute type: {type(data)}")
-
-
-# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L4814
-def _split_model_inputs(
- model_input: Union[ModelOutput, Dict], split_size: int, full_batch_size: int
-) -> List[Union[ModelOutput, Dict]]:
- """
- Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split
- size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from
- previous forward pass.
- """
- # Edge case: if model_input is None, return a list of Nones
- # this happens with Whisper where encoder_outputs is None
- if model_input is None:
- return [model_input] * (full_batch_size // split_size)
- # Infer the class from the object
- model_output_cls = type(model_input)
- if (full_batch_size % split_size) != 0:
- raise ValueError("`full_batch_size` must be divisible by `split_size`")
-
- if split_size > full_batch_size:
- raise ValueError("`split_size` must be smaller or equal to `full_batch_size`")
-
- # Helper function to split tensors or tuples of tensors
-
- # Find all the dataclass fields (e.g., last_hidden_state, pooler_output etc.) and split them
- keys = (
- model_input.__dataclass_fields__.keys() if hasattr(model_input, "__dataclass_fields__") else model_input.keys()
- )
- # We only keep keys that are in the model_input
- keys = [k for k in keys if k in model_input]
- # Here we can have four types of values: tensors, tuples of tensors and booleans, and encoder_outputs which is a
- # ModelOutput object.
- # bool should not be split but replicated for each split
- bool_keys = [k for k in keys if isinstance(model_input[k], bool) or k == "cache_position"]
- keys_to_ignore = ["cache_position", "encoder_outputs"]
- non_bool_keys = [k for k in keys if not isinstance(model_input[k], bool) and k not in keys_to_ignore]
-
- # we split the tensors and tuples of tensors
- data_split_list = [
- {k: _split(model_input[k], full_batch_size, split_size)[i] for k in non_bool_keys}
- for i in range(full_batch_size // split_size)
- ]
- # bool values are the same and replicated for each split
- bool_data = {k: model_input[k] for k in bool_keys}
- # encoder_outputs is a ModelOutput object and should be split by its own
- if "encoder_outputs" in model_input:
- encoder_outputs_split = _split_model_inputs(model_input["encoder_outputs"], split_size, full_batch_size)
- data_split_list = [
- {**data_split, "encoder_outputs": encoder_outputs_split[i]} for i, data_split in enumerate(data_split_list)
- ]
-
- # Convert each dictionary in the list to an object of the inferred class
- split_model_inputs: List[Union[ModelOutput, Dict]] = [
- model_output_cls(**data_split, **bool_data) for data_split in data_split_list
- ]
-
- return split_model_inputs
-
-
-# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L4871
-def stack_model_outputs(model_outputs: List[ModelOutput]) -> ModelOutput:
- """
- Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
- specific ModelOutput subclass from the list provided.
- """
- if not model_outputs:
- raise ValueError("Input list is empty.")
-
- # Infer the class from the first object in the list
- model_output_cls = type(model_outputs[0])
-
- # Ensure all objects are of the same type
- if not all(isinstance(obj, model_output_cls) for obj in model_outputs):
- raise ValueError("All elements in the list should be of the same type.")
-
- # Helper function to concat tensors or tuples of tensors
- def _concat(data):
- """
- Reverse of `_split` function above.
- """
- if any(data is None for data in data):
- return None
- if isinstance(data[0], torch.Tensor):
- return torch.cat(data, dim=0)
- elif isinstance(data[0], tuple):
- # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
- if isinstance(data[0][0], tuple):
- return tuple(
- tuple(torch.cat([attr[i][j] for attr in data], dim=0) for j in range(len(data[0][0])))
- for i in range(len(data[0]))
- )
- else:
- return tuple(torch.cat([attr[i] for attr in data], dim=0) for i in range(len(data[0])))
- elif isinstance(data[0], (int, float)):
- # If the elements are integers or floats, return a tensor
- return torch.tensor(data)
- else:
- raise ValueError(f"Unexpected attribute type: {type(data[0])}")
-
- # Use a dictionary comprehension to gather attributes from all objects and concatenate them
- concatenated_data = {
- k: _concat([getattr(model_output, k) for model_output in model_outputs])
- for k in model_output_cls.__dataclass_fields__.keys()
- }
-
- # Return a new object of the inferred class with the concatenated attributes
- return model_output_cls(**concatenated_data)
\ No newline at end of file
+        log.warning(f'The minimum transformers version required to collect 1st and 2nd token latency for {search_type} is {min_version}')
+ bench_hook = None
+ return bench_hook
\ No newline at end of file
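`get_bench_hook` now owns the transformers version gate that each hook's `new_forward` used to perform, so `ov_utils.py` and `pt_utils.py` (below) only need to handle the `None` case. A hedged usage sketch, assuming it runs inside the llm_bench package (so `utils.hook_common` is importable) and that `model`/`tokenizer` follow the usual Hugging Face tokenize/generate API; the helper name is illustrative:

```python
import utils.hook_common as hook_common


def run_with_latency_hook(model, tokenizer, prompt: str, num_beams: int):
    # num_beams > 1 installs BeamSearchHook, otherwise GreedySearchHook;
    # with transformers < 4.40.0 a warning is logged and None is returned.
    bench_hook = hook_common.get_bench_hook(num_beams, model)

    inputs = tokenizer(prompt, return_tensors='pt')
    model.generate(**inputs, num_beams=num_beams, max_new_tokens=32)

    if bench_hook is None:  # old transformers: no per-token data was collected
        return [], []
    tm_list = bench_hook.get_time_list()              # wall time per token, seconds
    tm_infer_list = bench_hook.get_time_infer_list()  # pure inference time per token
    bench_hook.clear_time_list()                      # reset before the next iteration
    bench_hook.clear_time_infer_list()
    return tm_list, tm_infer_list
```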
diff --git a/llm_bench/python/utils/hook_greedy_search.py b/llm_bench/python/utils/hook_greedy_search.py
index 7fcbff2fd7..a3912726d7 100644
--- a/llm_bench/python/utils/hook_greedy_search.py
+++ b/llm_bench/python/utils/hook_greedy_search.py
@@ -5,20 +5,19 @@
import time
import torch
import warnings
-import transformers
-import torch.distributed as dist
import logging as log
-import utils.hook_common as hook_common
-from packaging import version
from typing import Optional, Tuple, Union, List
from transformers.generation.stopping_criteria import (
+ EosTokenCriteria,
StoppingCriteriaList,
validate_stopping_criteria,
)
from transformers.generation.logits_process import LogitsProcessorList
from transformers.generation.streamers import BaseStreamer
from transformers.utils import ModelOutput
-import utils.hook_greedy_search_old as hook_old_greedy
+
+
+logger = log.getLogger(__name__)
class GenerateDecoderOnlyOutput(ModelOutput):
@@ -47,8 +46,8 @@ class GenerateEncoderDecoderOutput(ModelOutput):
tm_list = []
tm_infer_list = []
-# Transformers version: Release/v4.39.2 97c00cdfe132164dbd793447a088432fa359fd36
-# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L2244
+# Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99
+# Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2310
# Add the function of collecting latency
def new_greedy_search(
self,
@@ -173,10 +172,27 @@ def new_greedy_search(
)
stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
- eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+ if eos_token_id is not None:
+ logger.warning_once(
+ "`eos_token_id` is deprecated in this function and will be removed in v4.41, use"
+ " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead."
+ " Otherwise make sure to set `model.generation_config.eos_token_id`",
+ FutureWarning,
+ )
+ stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
+ else:
+ # TODO remove when the method is totally private
+ # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever
+ eos_token_id = [
+ criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id")
+ ]
+ eos_token_id = eos_token_id[0] if eos_token_id else None
+ if eos_token_id is None and self.generation_config.eos_token_id is not None:
+ eos_token_id = self.generation_config.eos_token_id
+ stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
+
if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
- eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
output_attentions = (
output_attentions if output_attentions is not None else self.generation_config.output_attentions
@@ -274,12 +290,6 @@ def new_greedy_search(
is_encoder_decoder=self.config.is_encoder_decoder,
)
- # if eos_token was found in one sentence, set sentence to finished
- if eos_token_id_tensor is not None:
- unfinished_sequences = unfinished_sequences.mul(
- next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
- )
-
unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
this_peer_finished = unfinished_sequences.max() == 0
tm_list.append(time.perf_counter() - tic)
@@ -340,15 +350,7 @@ def get_time_infer_list(self):
global tm_infer_list
return tm_infer_list
- def new_forward(self, model, model_type=None):
+ def new_forward(self, model):
"""Define a new greedy search function."""
- min_version = version.parse(hook_common.TRANS_MIN_VERSION)
- trans_version = version.parse(transformers.__version__)
- if trans_version < min_version:
- log.warning(f'The function of getting latency of greedy search will not be available with current transformers version:{trans_version}')
- else:
- min_second_version = version.parse(hook_common.TRANS_SENCOND_VERSION)
- if trans_version >= min_second_version:
- model._greedy_search = new_greedy_search.__get__(model, model.__class__)
- else:
- model.greedy_search = hook_old_greedy.old_greedy_search.__get__(model, model.__class__)
+ model._greedy_search = new_greedy_search.__get__(model, model.__class__)
+
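As in the beam-search hook, the greedy-search hook now tracks upstream v4.40: the manual `eos_token_id_tensor` comparison is dropped in favour of an `EosTokenCriteria` appended to `stopping_criteria`, and the loop keeps the single `unfinished_sequences & ~stopping_criteria(...)` check. A small sketch of that stop check with a toy batch (values are illustrative; assumes a transformers release that ships `EosTokenCriteria`, such as the 4.40 line this patch targets):

```python
import torch
from transformers.generation.stopping_criteria import (
    EosTokenCriteria,
    StoppingCriteriaList,
)

stopping_criteria = StoppingCriteriaList([EosTokenCriteria(eos_token_id=2)])

# two toy sequences: the first just produced EOS (id 2), the second did not
input_ids = torch.tensor([[5, 7, 2],
                          [5, 7, 9]])
unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long)

# the criteria return a per-sequence "is done" mask; scores are unused by
# EosTokenCriteria, so None is passed here for brevity
unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, None)
this_peer_finished = unfinished_sequences.max() == 0
print(unfinished_sequences)  # -> tensor([0, 1]): generation continues for sequence 2
```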
diff --git a/llm_bench/python/utils/hook_greedy_search_old.py b/llm_bench/python/utils/hook_greedy_search_old.py
deleted file mode 100644
index 595aa596da..0000000000
--- a/llm_bench/python/utils/hook_greedy_search_old.py
+++ /dev/null
@@ -1,302 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (C) 2023-2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-# flake8: noqa
-import time
-import torch
-import warnings
-import torch.distributed as dist
-from typing import Optional, Tuple, Union, List
-from transformers.generation.stopping_criteria import (
- StoppingCriteriaList,
- validate_stopping_criteria,
-)
-from transformers.generation.logits_process import LogitsProcessorList
-from transformers.generation.streamers import BaseStreamer
-from transformers.utils import ModelOutput
-import utils.hook_greedy_search as hook_greedy
-
-
-class GreedySearchDecoderOnlyOutput(ModelOutput):
- sequences: torch.LongTensor = None
- scores: Optional[Tuple[torch.FloatTensor]] = None
- attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
- hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-
-
-class GreedySearchEncoderDecoderOutput(ModelOutput):
- sequences: torch.LongTensor = None
- scores: Optional[Tuple[torch.FloatTensor]] = None
- encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
- encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
- decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
- cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
- decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
-
-
-GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput]
-
-# Transformers version: Release/v4.35.2 514de24abfd4416aeba6a6455ad5920f57f3567d
-# Copied from https://github.com/huggingface/transformers/blob/514de24abfd4416aeba6a6455ad5920f57f3567d/src/transformers/generation/utils.py#L2353
-# Add the function of collecting latency
-def old_greedy_search(
- self,
- input_ids: torch.LongTensor,
- logits_processor: Optional[LogitsProcessorList] = None,
- stopping_criteria: Optional[StoppingCriteriaList] = None,
- max_length: Optional[int] = None,
- pad_token_id: Optional[int] = None,
- eos_token_id: Optional[Union[int, List[int]]] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- output_scores: Optional[bool] = None,
- return_dict_in_generate: Optional[bool] = None,
- synced_gpus: bool = False,
- streamer: Optional["BaseStreamer"] = None,
- **model_kwargs,
- ) -> Union[GreedySearchOutput, torch.LongTensor]:
- r"""
- Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be
- used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
-
-
-
- In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate()
- instead. For an overview of generation strategies and code examples, check the [following
- guide](../generation_strategies).
-
-
-
-
- Parameters:
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
- The sequence used as a prompt for the generation.
- logits_processor (`LogitsProcessorList`, *optional*):
- An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
- used to modify the prediction scores of the language modeling head applied at each generation step.
- stopping_criteria (`StoppingCriteriaList`, *optional*):
- An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
- used to tell if the generation loop should stop.
-
- max_length (`int`, *optional*, defaults to 20):
- **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
- tokens. The maximum length of the sequence to be generated.
- pad_token_id (`int`, *optional*):
- The id of the *padding* token.
- eos_token_id (`Union[int, List[int]]`, *optional*):
- The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
- output_attentions (`bool`, *optional*, defaults to `False`):
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
- returned tensors for more details.
- output_hidden_states (`bool`, *optional*, defaults to `False`):
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
- for more details.
- output_scores (`bool`, *optional*, defaults to `False`):
- Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
- return_dict_in_generate (`bool`, *optional*, defaults to `False`):
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
- synced_gpus (`bool`, *optional*, defaults to `False`):
- Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
- streamer (`BaseStreamer`, *optional*):
- Streamer object that will be used to stream the generated sequences. Generated tokens are passed
- through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
- model_kwargs:
- Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
- If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
-
- Return:
- [`~generation.GreedySearchDecoderOnlyOutput`], [`~generation.GreedySearchEncoderDecoderOutput`] or
- `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
- [`~generation.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
- `return_dict_in_generate=True` or a [`~generation.GreedySearchEncoderDecoderOutput`] if
- `model.config.is_encoder_decoder=True`.
-
- Examples:
-
- ```python
- >>> from transformers import (
- ... AutoTokenizer,
- ... AutoModelForCausalLM,
- ... LogitsProcessorList,
- ... MinLengthLogitsProcessor,
- ... StoppingCriteriaList,
- ... MaxLengthCriteria,
- ... )
-
- >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
- >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
-
- >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
- >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
-
- >>> input_prompt = "It might be possible to"
- >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
-
- >>> # instantiate logits processors
- >>> logits_processor = LogitsProcessorList(
- ... [
- ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id),
- ... ]
- ... )
- >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
-
- >>> outputs = model.greedy_search(
- ... input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria
- ... )
-
- >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
- ["It might be possible to get a better understanding of the nature of the problem, but it's not"]
- ```"""
- # init values
- logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
- stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
- if max_length is not None:
- warnings.warn(
- "`max_length` is deprecated in this function, use"
- " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
- UserWarning,
- )
- stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
- pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
- eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
- if isinstance(eos_token_id, int):
- eos_token_id = [eos_token_id]
- eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
- output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
- output_attentions = (
- output_attentions if output_attentions is not None else self.generation_config.output_attentions
- )
- output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
- )
- return_dict_in_generate = (
- return_dict_in_generate
- if return_dict_in_generate is not None
- else self.generation_config.return_dict_in_generate
- )
-
- # init attention / hidden states / scores tuples
- scores = () if (return_dict_in_generate and output_scores) else None
- decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
- cross_attentions = () if (return_dict_in_generate and output_attentions) else None
- decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
-
- # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
- if return_dict_in_generate and self.config.is_encoder_decoder:
- encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
- encoder_hidden_states = (
- model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
- )
-
- # keep track of which sequences are already finished
- unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
-
- this_peer_finished = False # used by synced_gpus only
- while True:
- tic = time.perf_counter()
- if synced_gpus:
- # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
- # The following logic allows an early break if all peers finished generating their sequence
- this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
- # send 0.0 if we finished, 1.0 otherwise
- dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
- # did all peers finish? the reduced sum will be 0.0 then
- if this_peer_finished_flag.item() == 0.0:
- break
-
- # prepare model inputs
- model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-
- # forward pass to get next token
- tic_infer = time.perf_counter()
- outputs = self(
- **model_inputs,
- return_dict=True,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- )
- hook_greedy.tm_infer_list.append(time.perf_counter() - tic_infer)
-
- if synced_gpus and this_peer_finished:
- continue # don't waste resources running the code we don't need
-
- next_token_logits = outputs.logits[:, -1, :]
-
- # pre-process distribution
- next_tokens_scores = logits_processor(input_ids, next_token_logits)
-
- # Store scores, attentions and hidden_states when required
- if return_dict_in_generate:
- if output_scores:
- scores += (next_tokens_scores,)
- if output_attentions:
- decoder_attentions += (
- (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
- )
- if self.config.is_encoder_decoder:
- cross_attentions += (outputs.cross_attentions,)
-
- if output_hidden_states:
- decoder_hidden_states += (
- (outputs.decoder_hidden_states,)
- if self.config.is_encoder_decoder
- else (outputs.hidden_states,)
- )
-
- # argmax
- next_tokens = torch.argmax(next_tokens_scores, dim=-1)
-
- # finished sentences should have their next token be a padding token
- if eos_token_id is not None:
- if pad_token_id is None:
- raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
- next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
-
- # update generated ids, model inputs, and length for next step
- input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
- if streamer is not None:
- streamer.put(next_tokens.cpu())
- model_kwargs = self._update_model_kwargs_for_generation(
- outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
- )
-
- # if eos_token was found in one sentence, set sentence to finished
- if eos_token_id_tensor is not None:
- unfinished_sequences = unfinished_sequences.mul(
- next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
- )
-
- # stop when each sentence is finished
- if unfinished_sequences.max() == 0:
- this_peer_finished = True
-
- # stop if we exceed the maximum length
- if stopping_criteria(input_ids, scores):
- this_peer_finished = True
- hook_greedy.tm_list.append(time.perf_counter() - tic)
- if this_peer_finished and not synced_gpus:
- break
-
- if streamer is not None:
- streamer.end()
-
- if return_dict_in_generate:
- if self.config.is_encoder_decoder:
- return GreedySearchEncoderDecoderOutput(
- sequences=input_ids,
- scores=scores,
- encoder_attentions=encoder_attentions,
- encoder_hidden_states=encoder_hidden_states,
- decoder_attentions=decoder_attentions,
- cross_attentions=cross_attentions,
- decoder_hidden_states=decoder_hidden_states,
- )
- else:
- return GreedySearchDecoderOnlyOutput(
- sequences=input_ids,
- scores=scores,
- attentions=decoder_attentions,
- hidden_states=decoder_hidden_states,
- )
- else:
- return input_ids
\ No newline at end of file
diff --git a/llm_bench/python/utils/ov_utils.py b/llm_bench/python/utils/ov_utils.py
index a2416ccb92..3d77941ca9 100644
--- a/llm_bench/python/utils/ov_utils.py
+++ b/llm_bench/python/utils/ov_utils.py
@@ -9,9 +9,7 @@
import torch
import time
import types
-import utils.hook_greedy_search
-import utils.hook_beam_search
-
+import utils.hook_common as hook_common
from utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES
import openvino.runtime.opset13 as opset
@@ -159,11 +157,7 @@ def create_text_gen_model(model_path, device, **kwargs):
if not isinstance(ov_model, OV_MODEL_CLASSES_MAPPING['t5']):
patch_inter_processing_and_compile(ov_model, **kwargs)
end = time.perf_counter()
- if kwargs['num_beams'] > 1:
- bench_hook = utils.hook_beam_search.BeamSearchHook()
- else:
- bench_hook = utils.hook_greedy_search.GreedySearchHook()
- bench_hook.new_forward(ov_model, model_type)
+ bench_hook = hook_common.get_bench_hook(kwargs['num_beams'], ov_model)
from_pretrained_time = end - start
log.info(f'From pretrained time: {from_pretrained_time:.2f}s')
# load token
diff --git a/llm_bench/python/utils/pt_utils.py b/llm_bench/python/utils/pt_utils.py
index d703f4bb1a..ccf401330c 100644
--- a/llm_bench/python/utils/pt_utils.py
+++ b/llm_bench/python/utils/pt_utils.py
@@ -7,11 +7,7 @@
import os
import time
import logging as log
-import openvino.torch # noqa: F401
-import utils.hook_greedy_search
-import utils.hook_beam_search
-
-MAX_CONNECT_TIME = 50
+import utils.hook_common as hook_common
def set_bf16(model, device, **kwargs):
@@ -95,11 +91,7 @@ def create_text_gen_model(model_path, device, **kwargs):
else:
raise RuntimeError('==Failure ==: no device to load')
- if kwargs['num_beams'] > 1:
- bench_hook = utils.hook_beam_search.BeamSearchHook()
- else:
- bench_hook = utils.hook_greedy_search.GreedySearchHook()
- bench_hook.new_forward(model, model_type)
+ bench_hook = hook_common.get_bench_hook(kwargs['num_beams'], model)
if kwargs['torch_compile_backend']:
backend = kwargs['torch_compile_backend']