update hook function to transformers v4.40-release
wgzintel committed May 20, 2024
1 parent 7a046e1 commit a69d187
Showing 8 changed files with 247 additions and 919 deletions.
16 changes: 10 additions & 6 deletions llm_bench/python/benchmark.py
@@ -132,10 +132,13 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
if num == 0:
warmup_md5[prompt_index] = result_md5_list
per_token_time = generation_time * 1000 / (num_tokens / args['batch_size'])
tm_list = bench_hook.get_time_list()
log.debug('latency of all tokens:')
[log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
tm_infer_list = bench_hook.get_time_infer_list()
tm_list = []
tm_infer_list = []
if bench_hook is not None:
tm_list = bench_hook.get_time_list()
log.debug('latency of all tokens:')
[log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
tm_infer_list = bench_hook.get_time_infer_list()
iter_data = gen_iterate_data(
num,
input_token_size * args['batch_size'],
@@ -168,8 +171,9 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0])
else:
utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0])
bench_hook.clear_time_list()
bench_hook.clear_time_infer_list()
if bench_hook is not None:
bench_hook.clear_time_list()
bench_hook.clear_time_infer_list()


def run_text_generation_benchmark(model_path, framework, device, args, num_iters):
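The benchmark.py change above makes the hook optional: when no hook is installed (get_bench_hook may return None, see hook_common.py below), the latency lists simply stay empty instead of failing on attribute access. A minimal, self-contained sketch of that guard pattern — DummyHook and the sample numbers are illustrative, only the method names come from this diff:

    class DummyHook:
        """Stand-in for GreedySearchHook/BeamSearchHook; returns canned per-token latencies."""
        def get_time_list(self):
            return [0.021, 0.005, 0.005]        # seconds per generated token
        def get_time_infer_list(self):
            return [0.020, 0.004, 0.004]        # seconds per inference request

    def collect_latencies(bench_hook):
        tm_list, tm_infer_list = [], []
        if bench_hook is not None:              # hook may be None on unsupported transformers versions
            tm_list = bench_hook.get_time_list()
            tm_infer_list = bench_hook.get_time_infer_list()
        return tm_list, tm_infer_list

    print(collect_latencies(DummyHook()))       # populated latency lists
    print(collect_latencies(None))              # ([], []) -- no AttributeError without a hook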
233 changes: 184 additions & 49 deletions llm_bench/python/utils/hook_beam_search.py

Large diffs are not rendered by default.

374 changes: 0 additions & 374 deletions llm_bench/python/utils/hook_beam_search_old.py

This file was deleted.

165 changes: 21 additions & 144 deletions llm_bench/python/utils/hook_common.py
@@ -2,149 +2,26 @@
# Copyright (C) 2023-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# flake8: noqa
import torch
from typing import Union, List, Dict
from transformers.utils import ModelOutput

TRANS_MIN_VERSION = '4.36.0'
TRANS_SENCOND_VERSION = '4.39.0'


# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L4783
def _split(data, full_batch_size: int, split_size: int = None):
"""
Takes care of three cases:
1. data is a tensor: e.g. last_hidden_state, pooler_output etc. split them on the batch_size dim
2. data is a tuple: e.g. hidden_states, attentions etc. Keep the tuple as it is and split each tensor in it and
return a list of tuples
3. data is a tuple of tuples, e.g. past_key_values. Keep the tuple as it is and split each tuple in it and
return a list of tuples of tuples
(see documentation of ModelOutput)
"""
if data is None:
return [None] * (full_batch_size // split_size)
if isinstance(data, torch.Tensor):
return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)]
elif isinstance(data, tuple):
# If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
if isinstance(data[0], tuple):
return [
tuple(tuple(tensor[i : i + split_size] for tensor in inner_tuple) for inner_tuple in data)
for i in range(0, full_batch_size, split_size)
]

import logging as log
import transformers
from packaging import version

TRANS_MIN_VERSION = '4.40.0'


def get_bench_hook(num_beams, ov_model):
min_version = version.parse(TRANS_MIN_VERSION)
trans_version = version.parse(transformers.__version__)
search_type = 'beam search' if num_beams > 1 else 'greedy search'
if trans_version >= min_version:
import utils.hook_greedy_search
import utils.hook_beam_search
if num_beams > 1:
bench_hook = utils.hook_beam_search.BeamSearchHook()
else:
return [
tuple(sub_tensor[i : i + split_size] for sub_tensor in data)
for i in range(0, full_batch_size, split_size)
]
bench_hook = utils.hook_greedy_search.GreedySearchHook()
bench_hook.new_forward(ov_model)
else:
raise ValueError(f"Unexpected attribute type: {type(data)}")


# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L4814
def _split_model_inputs(
model_input: Union[ModelOutput, Dict], split_size: int, full_batch_size: int
) -> List[Union[ModelOutput, Dict]]:
"""
Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split
size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from
previous forward pass.
"""
# Edge case: if model_input is None, return a list of Nones
# this happens with Whisper where encoder_outputs is None
if model_input is None:
return [model_input] * (full_batch_size // split_size)
# Infer the class from the object
model_output_cls = type(model_input)
if (full_batch_size % split_size) != 0:
raise ValueError("`full_batch_size` must be divisible by `split_size`")

if split_size > full_batch_size:
raise ValueError("`split_size` must be smaller or equal to `full_batch_size`")

# Helper function to split tensors or tuples of tensors

# Find all the dataclass fields (e.g., last_hidden_state, pooler_output etc.) and split them
keys = (
model_input.__dataclass_fields__.keys() if hasattr(model_input, "__dataclass_fields__") else model_input.keys()
)
# We only keep keys that are in the model_input
keys = [k for k in keys if k in model_input]
# Here we can have four types of values: tensors, tuples of tensors and booleans, and encoder_outputs which is a
# ModelOutput object.
# bool should not be split but replicated for each split
bool_keys = [k for k in keys if isinstance(model_input[k], bool) or k == "cache_position"]
keys_to_ignore = ["cache_position", "encoder_outputs"]
non_bool_keys = [k for k in keys if not isinstance(model_input[k], bool) and k not in keys_to_ignore]

# we split the tensors and tuples of tensors
data_split_list = [
{k: _split(model_input[k], full_batch_size, split_size)[i] for k in non_bool_keys}
for i in range(full_batch_size // split_size)
]
# bool values are the same and replicated for each split
bool_data = {k: model_input[k] for k in bool_keys}
# encoder_outputs is a ModelOutput object and should be split by its own
if "encoder_outputs" in model_input:
encoder_outputs_split = _split_model_inputs(model_input["encoder_outputs"], split_size, full_batch_size)
data_split_list = [
{**data_split, "encoder_outputs": encoder_outputs_split[i]} for i, data_split in enumerate(data_split_list)
]

# Convert each dictionary in the list to an object of the inferred class
split_model_inputs: List[Union[ModelOutput, Dict]] = [
model_output_cls(**data_split, **bool_data) for data_split in data_split_list
]

return split_model_inputs


# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L4871
def stack_model_outputs(model_outputs: List[ModelOutput]) -> ModelOutput:
"""
Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the
specific ModelOutput subclass from the list provided.
"""
if not model_outputs:
raise ValueError("Input list is empty.")

# Infer the class from the first object in the list
model_output_cls = type(model_outputs[0])

# Ensure all objects are of the same type
if not all(isinstance(obj, model_output_cls) for obj in model_outputs):
raise ValueError("All elements in the list should be of the same type.")

# Helper function to concat tensors or tuples of tensors
def _concat(data):
"""
Reverse of `_split` function above.
"""
if any(data is None for data in data):
return None
if isinstance(data[0], torch.Tensor):
return torch.cat(data, dim=0)
elif isinstance(data[0], tuple):
# If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
if isinstance(data[0][0], tuple):
return tuple(
tuple(torch.cat([attr[i][j] for attr in data], dim=0) for j in range(len(data[0][0])))
for i in range(len(data[0]))
)
else:
return tuple(torch.cat([attr[i] for attr in data], dim=0) for i in range(len(data[0])))
elif isinstance(data[0], (int, float)):
# If the elements are integers or floats, return a tensor
return torch.tensor(data)
else:
raise ValueError(f"Unexpected attribute type: {type(data[0])}")

# Use a dictionary comprehension to gather attributes from all objects and concatenate them
concatenated_data = {
k: _concat([getattr(model_output, k) for model_output in model_outputs])
for k in model_output_cls.__dataclass_fields__.keys()
}

# Return a new object of the inferred class with the concatenated attributes
return model_output_cls(**concatenated_data)
log.warning(f'The minimum version of transformers to get 1st and 2nd tokens latency of {search_type} is: {min_version}')
bench_hook = None
return bench_hook
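The rewritten get_bench_hook above gates hook installation on the installed transformers version using packaging.version rather than plain string comparison (lexically, '4.9.0' would sort after '4.40.0'). A small, self-contained sketch of that check; the helper name hooks_supported is illustrative and not part of this diff:

    import transformers
    from packaging import version

    TRANS_MIN_VERSION = '4.40.0'                 # mirrors the constant defined above

    def hooks_supported() -> bool:
        # version.parse() yields comparable Version objects, so 4.9 < 4.40 is handled correctly
        return version.parse(transformers.__version__) >= version.parse(TRANS_MIN_VERSION)

    print(hooks_supported())                     # False on e.g. 4.36.x, True on 4.40 and later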
54 changes: 28 additions & 26 deletions llm_bench/python/utils/hook_greedy_search.py
@@ -5,20 +5,19 @@
import time
import torch
import warnings
import transformers
import torch.distributed as dist
import logging as log
import utils.hook_common as hook_common
from packaging import version
from typing import Optional, Tuple, Union, List
from transformers.generation.stopping_criteria import (
EosTokenCriteria,
StoppingCriteriaList,
validate_stopping_criteria,
)
from transformers.generation.logits_process import LogitsProcessorList
from transformers.generation.streamers import BaseStreamer
from transformers.utils import ModelOutput
import utils.hook_greedy_search_old as hook_old_greedy


logger = log.getLogger(__name__)


class GenerateDecoderOnlyOutput(ModelOutput):
@@ -47,8 +46,8 @@ class GenerateEncoderDecoderOutput(ModelOutput):
tm_list = []
tm_infer_list = []

# Transformers version: Release/v4.39.2 97c00cdfe132164dbd793447a088432fa359fd36
# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L2244
# Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99
# Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2310
# Add the function of collecting latency
def new_greedy_search(
self,
@@ -173,10 +172,27 @@ def new_greedy_search(
)
stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
if eos_token_id is not None:
logger.warning_once(
"`eos_token_id` is deprecated in this function and will be removed in v4.41, use"
" `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead."
" Otherwise make sure to set `model.generation_config.eos_token_id`",
FutureWarning,
)
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))
else:
# TODO remove when the method is totally private
# need to get `eos_token_id` and add stopping criteria, so that generation does not go forever
eos_token_id = [
criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id")
]
eos_token_id = eos_token_id[0] if eos_token_id else None
if eos_token_id is None and self.generation_config.eos_token_id is not None:
eos_token_id = self.generation_config.eos_token_id
stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id))

if isinstance(eos_token_id, int):
eos_token_id = [eos_token_id]
eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
output_attentions = (
output_attentions if output_attentions is not None else self.generation_config.output_attentions
@@ -274,12 +290,6 @@ def new_greedy_search(
is_encoder_decoder=self.config.is_encoder_decoder,
)

# if eos_token was found in one sentence, set sentence to finished
if eos_token_id_tensor is not None:
unfinished_sequences = unfinished_sequences.mul(
next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
)

unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
this_peer_finished = unfinished_sequences.max() == 0
tm_list.append(time.perf_counter() - tic)
@@ -340,15 +350,7 @@ def get_time_infer_list(self):
global tm_infer_list
return tm_infer_list

def new_forward(self, model, model_type=None):
def new_forward(self, model):
"""Define a new greedy search function."""
min_version = version.parse(hook_common.TRANS_MIN_VERSION)
trans_version = version.parse(transformers.__version__)
if trans_version < min_version:
log.warning(f'The function of getting latency of greedy search will not be available with current transformers version:{trans_version}')
else:
min_second_version = version.parse(hook_common.TRANS_SENCOND_VERSION)
if trans_version >= min_second_version:
model._greedy_search = new_greedy_search.__get__(model, model.__class__)
else:
model.greedy_search = hook_old_greedy.old_greedy_search.__get__(model, model.__class__)
model._greedy_search = new_greedy_search.__get__(model, model.__class__)
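The simplified new_forward above attaches the instrumented new_greedy_search to a single model instance through the descriptor protocol (function.__get__). A minimal, self-contained illustration of that patching technique — the Model class here is a toy, not the transformers API:

    class Model:
        def _greedy_search(self):
            return 'original greedy search'

    def new_greedy_search(self):
        return 'patched greedy search with timing'

    model = Model()
    # __get__(instance, cls) turns the plain function into a method bound to this instance only
    model._greedy_search = new_greedy_search.__get__(model, Model)

    print(model._greedy_search())    # 'patched greedy search with timing'
    print(Model()._greedy_search())  # 'original greedy search' -- other instances keep the original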

