
Commit

resolve conflict
wgzintel committed May 23, 2024
2 parents a69d187 + 03e78fe commit 8c83f6e
Showing 3 changed files with 18 additions and 27 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/llm_bench-python.yml
@@ -39,7 +39,7 @@ jobs:
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest black
- pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt
+ GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt
pip install openvino-nightly
- name: Lint with flake8
@@ -73,7 +73,7 @@ jobs:
python-version: 3.8
- name: Test stateful
run: |
- python -m pip install -r llm_bench/python/requirements.txt
+ GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r llm_bench/python/requirements.txt
python -m pip uninstall --yes openvino
python -m pip install openvino-nightly
python llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir . --stateful
39 changes: 15 additions & 24 deletions llm_bench/python/benchmark.py
@@ -34,6 +34,7 @@
DEFAULT_SUPER_RESOLUTION_STEPS = 50
DEFAULT_SUPER_RESOLUTION_WIDTH = 128
DEFAULT_SUPER_RESOLUTION_HEIGHT = 128
+ DEFAULT_OUTPUT_TOKEN_SIZE = 512
MAX_OUTPUT_TOKEN_SIZE = 64 * 1024

mem_consumption = MemConsumption()
@@ -87,22 +88,22 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
# Remove `token_type_ids` from inputs
input_tokens = input_data['input_ids'] if 'input_ids' in input_data else input_data
input_token_size = input_tokens[0].numel()

+ max_output_token_size = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
+ max_output_token_size = MAX_OUTPUT_TOKEN_SIZE if max_output_token_size > MAX_OUTPUT_TOKEN_SIZE else max_output_token_size
if args['batch_size'] > 1:
out_str = '[warm-up]' if num == 0 else '[{}]'.format(num)
out_str += " Batch_size={}, ".format(args['batch_size'])
out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size'])
if args['infer_count'] is not None:
- out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size'])
+ out_str += 'all max_output_token_size: {} * {}'.format(max_output_token_size, args['batch_size'])
log.info(out_str)

max_rss_mem_consumption = ''
max_shared_mem_consumption = ''
if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
mem_consumption.start_collect_memory_consumption()
- min_gen_tokens = 0 if args['infer_count'] is None else args['infer_count']
- max_gen_tokens = MAX_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
start = time.perf_counter()
- result = model.generate(**input_data, min_new_tokens=int(min_gen_tokens), max_new_tokens=int(max_gen_tokens), num_beams=args['num_beams'], use_cache=True)
+ result = model.generate(**input_data, max_new_tokens=int(max_output_token_size), num_beams=args['num_beams'], use_cache=True)
end = time.perf_counter()
if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
mem_consumption.end_collect_momory_consumption()
@@ -123,7 +124,7 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
else:
generated_text_len = len(result[bs_idx])
num_tokens += generated_text_len
- if generated_text_len > max_gen_tokens:
+ if generated_text_len > max_output_token_size:
log.error('Output token size is over max output token size!')
result_text = generated_text[bs_idx]
if args["output_dir"] is not None:
@@ -136,13 +137,11 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
tm_infer_list = []
if bench_hook is not None:
tm_list = bench_hook.get_time_list()
- log.debug('latency of all tokens:')
- [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)]
tm_infer_list = bench_hook.get_time_infer_list()
iter_data = gen_iterate_data(
num,
input_token_size * args['batch_size'],
- len(tm_infer_list),
+ max_output_token_size,
num_tokens,
generation_time,
per_token_time,
@@ -178,7 +177,7 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,

def run_text_generation_benchmark(model_path, framework, device, args, num_iters):
model, tokenizer, pretrain_time, bench_hook = FW_UTILS[framework].create_text_gen_model(model_path, device, **args)
- model_precision = utils.model_utils.get_model_precision(model_path.parents._parts)
+ model_precision = utils.model_utils.get_model_precision(model_path.parts)
iter_data_list = []
warmup_md5 = {}
input_text_list = utils.model_utils.get_prompts(args)
@@ -417,15 +416,6 @@ def num_iters_type(x):
return x


- def num_infer_count_type(x):
-     x = int(x)
-     if x < 1:
-         raise argparse.ArgumentTypeError('Minimum input value is 1')
-     elif x > MAX_OUTPUT_TOKEN_SIZE:
-         raise argparse.ArgumentTypeError(f'Max input value is {MAX_OUTPUT_TOKEN_SIZE}')
-     return x


def get_argprser():
parser = argparse.ArgumentParser('LLM benchmarking tool', add_help=True, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('-m', '--model', help='model folder including IR files or Pytorch files', required=TabError)
@@ -439,8 +429,9 @@ def get_argprser():
'-ic',
'--infer_count',
default=None,
- type=num_infer_count_type,
- help='set the output token size, the value must be greater than 0.'
+ type=int,
+ help='limit the output token size '
+ f'(default {DEFAULT_OUTPUT_TOKEN_SIZE}) of text_gen and code_gen models.',
)
parser.add_argument(
'-n',
@@ -514,7 +505,7 @@ def get_argprser():


def main():
- log.basicConfig(format='[ %(levelname)s ] %(message)s', level=os.environ.get("LOGLEVEL", log.INFO), stream=sys.stdout)
+ log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
args = get_argprser()
model_path, framework, model_args, model_name = utils.model_utils.analyze_args(args)

@@ -537,10 +528,10 @@ def main():
if args.report is not None or args.report_json is not None:
model_precision = ''
if framework == 'ov':
- ir_conversion_frontend = utils.model_utils.get_ir_conversion_frontend(model_name, model_path.parents._parts)
+ ir_conversion_frontend = utils.model_utils.get_ir_conversion_frontend(model_name, model_path.parts)
if ir_conversion_frontend != '':
framework = framework + '(' + ir_conversion_frontend + ')'
- model_precision = utils.model_utils.get_model_precision(model_path.parents._parts)
+ model_precision = utils.model_utils.get_model_precision(model_path.parts)
if args.report is not None:
utils.output_csv.write_result(
args.report,
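Note on the benchmark.py hunks above: the resolved side drops the separate min_gen_tokens/max_gen_tokens pair and the num_infer_count_type validator, and instead derives a single max_output_token_size from --infer_count, falling back to a default and clamping at a hard cap. The snippet below is a minimal standalone sketch of that selection logic only; the helper name resolve_max_output_tokens and its bare infer_count argument are illustrative and do not appear in the file.

    DEFAULT_OUTPUT_TOKEN_SIZE = 512
    MAX_OUTPUT_TOKEN_SIZE = 64 * 1024

    def resolve_max_output_tokens(infer_count=None):
        # Fall back to the default when --infer_count is not given,
        # then clamp the requested size to the hard upper limit.
        size = DEFAULT_OUTPUT_TOKEN_SIZE if infer_count is None else infer_count
        return min(size, MAX_OUTPUT_TOKEN_SIZE)

    # resolve_max_output_tokens()      -> 512
    # resolve_max_output_tokens(100)   -> 100
    # resolve_max_output_tokens(10**6) -> 65536 (capped at MAX_OUTPUT_TOKEN_SIZE)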
2 changes: 1 addition & 1 deletion llm_bench/python/requirements.txt
@@ -8,7 +8,7 @@ transformers>=4.33.0
diffusers>=0.22.0
#optimum is in dependency list of optimum-intel
git+https://github.com/huggingface/optimum-intel.git@8c2b787cc75a45ae4670d37970a5394eba90eedc#egg=optimum-intel
- git+https://github.com/openvinotoolkit/nncf.git#egg=nncf
+ git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf
packaging
psutil
timm
