From 17d4ef71ba0772283138cc3e8f3bc8768e2bdfa6 Mon Sep 17 00:00:00 2001
From: eaidova
Date: Mon, 20 Jan 2025 21:53:26 +0400
Subject: [PATCH 1/4] enable prompt permutations to prevent prefix caching

---
 tools/llm_bench/benchmark.py                  |  2 ++
 .../llm_bench/llm_bench_utils/model_utils.py  |  7 +++---
 tools/llm_bench/task/text_generation.py       | 23 ++++++++++++++++++-
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py
index 6fc135c4ef..25460599d8 100644
--- a/tools/llm_bench/benchmark.py
+++ b/tools/llm_bench/benchmark.py
@@ -161,6 +161,8 @@ def get_argprser():
     parser.add_argument("--num_steps", type=int, required=False, help="Number of inference steps for image generation")
     parser.add_argument("--height", type=int, required=False, help="Generated image height. Applicable only for Image Generation.")
     parser.add_argument("--width", type=int, required=False, help="Generated image width. Applicable only for Image Generation.")
+    parser.add_argument("--disable_prompt_permutation", action="store_true", help="Disable prompt modification from run to run to avoid prefix caching")
+
     return parser.parse_args()

diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py
index 324a67bc2a..585d4a557c 100644
--- a/tools/llm_bench/llm_bench_utils/model_utils.py
+++ b/tools/llm_bench/llm_bench_utils/model_utils.py
@@ -37,12 +37,12 @@ def get_param_from_file(args, input_key):
             if args["use_case"] != "vlm":
                 raise RuntimeError("Multiple sources for benchmarking supported only for Visual Language Models")
             data_dict = {}
-            if args["media"] is None:
+            if args["media"] is None and args["image"] is None:
                 log.warn("Input image is not provided. Only text generation part will be evaluated")
             else:
-                data_dict["media"] = args["media"]
+                data_dict["media"] = args["media"] if args["media"] is not None else args["image"]
             if args["prompt"] is None:
-                data_dict["prompt"] = "What is OpenVINO?" if args["media"] is None else "Describe image"
+                data_dict["prompt"] = "What is OpenVINO?" if data_dict.get("media") is None else "Describe image"
             else:
                 data_dict["prompt"] = args["prompt"]
             data_list.append(data_dict)
@@ -113,6 +113,7 @@ def analyze_args(args):
    model_args['torch_compile_options'] = args.torch_compile_options
    model_args['torch_compile_input_module'] = args.torch_compile_input_module
    model_args['media'] = args.media
+    model_args["disable_prompt_permutation"] = args.disable_prompt_permutation
 
    optimum = args.optimum

diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index de798f158f..872fdfa2c2 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -207,6 +207,17 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     tokenization_end = time.perf_counter()
     tokenization_time = [(tokenization_end - tokenization_start) * 1000]
 
+    enable_prompt_permutations = not args.get("disable_prompt_permutation", False)
+    if enable_prompt_permutations:
+        log.warning("Enabled input prompt permutations. This means that generation results can vary between iterations. If this is not expected, please specify --disable_prompt_permutation in your benchmarking command to disable this behavior")
+        from openvino_genai import TokenizedInputs
+        import openvino as ov
+
+        input_ids = input_data.input_ids.data
+        input_ids[:, 0] = num + 1
+        attention_mask = input_data.attention_mask
+        input_data = TokenizedInputs(input_ids=ov.Tensor(input_ids), attention_mask=attention_mask)
+
     num_input_tokens = input_data.input_ids.shape[1]
     if args['batch_size'] > 1:
         out_str = '[warm-up]' if num == 0 else '[{}]'.format(num)
@@ -325,7 +336,7 @@ def token_printer():
             batch_size=args['batch_size'],
             prompt_idx=prompt_index
         )
-    if num > 0:
+    if num > 0 and not enable_prompt_permutations:
         prev_md5 = md5_list[num - 1][prompt_index]
         if result_md5_list != prev_md5:
             log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} "
@@ -366,6 +377,16 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
     gen_config.max_new_tokens = max_gen_tokens
     gen_config.num_beams = args["num_beams"]
     gen_config.do_sample = False
+    enable_prompt_permutations = not args.get("disable_prompt_permutation", False)
+    if enable_prompt_permutations:
+        log.warning("Enabled input prompt permutations. This means that generation results can vary between iterations. If this is not expected, please specify --disable_prompt_permutation in your benchmarking command to disable this behavior")
+        from openvino_genai import TokenizedInputs
+        import openvino as ov
+
+        input_ids = input_data.input_ids.data
+        input_ids[:, 0] = num + 1
+        attention_mask = input_data.attention_mask
+        input_data = TokenizedInputs(input_ids=ov.Tensor(input_ids), attention_mask=attention_mask)
     if args.get('draft_model', ''):
         config_info = "Speculative decoding config: "
         if args.get("num_assistant_tokens", None):
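
A note on what patch 1 is doing: prefix caching lets the pipeline reuse KV-cache entries for a prompt prefix it has already computed, so benchmarking an identical prompt on every iteration would understate prefill cost from the second run onward. The patch defeats the cache by overwriting the first token id with the iteration number, so no two runs share a prefix. Below is a minimal standalone sketch of the same idea; the model path, device, and iteration count are illustrative placeholders, while TokenizedInputs and the writable .data view are used exactly as in the patch:

    # Sketch: give each benchmark iteration a unique first token so prefix
    # caching cannot reuse the previous run's KV cache.
    import openvino as ov
    from openvino_genai import LLMPipeline, TokenizedInputs

    pipe = LLMPipeline("model_dir", "CPU")   # placeholder model path and device
    tokenizer = pipe.get_tokenizer()

    for num in range(3):                     # placeholder iteration count
        inputs = tokenizer.encode("What is OpenVINO?")
        input_ids = inputs.input_ids.data    # writable numpy view of the ov.Tensor
        input_ids[:, 0] = num + 1            # unique first token id per run
        permuted = TokenizedInputs(input_ids=ov.Tensor(input_ids), attention_mask=inputs.attention_mask)
        result = pipe.generate(permuted, max_new_tokens=32)

Because outputs now legitimately differ between iterations, the patch also gates the md5 consistency check on not enable_prompt_permutations.
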
if data_dict["media"] is None else "Describe image" else: diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index 872fdfa2c2..372a034148 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ -209,7 +209,10 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data enable_prompt_permutations = not args.get("disable_prompt_permutation", False) if enable_prompt_permutations: - log.warning("Enabled input prompt permutations. It means that generation results can be vary on different steps. If it does not expected please specify --disable_prompr_permutation in your benchmarking command to disable this behaviour") + log.warning( + "Enabled input prompt permutations. It means that generation results can be vary on different steps. " + "If it does not expected please specify --disable_prompr_permutation in your benchmarking command to disable this behavior" + ) from openvino_genai import TokenizedInputs import openvino as ov @@ -217,7 +220,6 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data input_ids[:, 0] = num + 1 attention_mask = input_data.attention_mask input_data = TokenizedInputs(input_ids=ov.Tensor(input_ids), attention_mask=attention_mask) - num_input_tokens = input_data.input_ids.shape[1] if args['batch_size'] > 1: out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) @@ -379,7 +381,10 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg gen_config.do_sample = False enable_prompt_permutations = not args.get("disable_prompt_permutation", False) if enable_prompt_permutations: - log.warning("Enabled input prompt permutations. It means that generation results can be vary on different steps. If it does not expected please specify --disable_prompr_permutation in your benchmarking command to disable this behaviour") + log.warning( + "Enabled input prompt permutations. It means that generation results can be vary on different steps. 
" + "If it does not expected please specify --disable_prompr_permutation in your benchmarking command to disable this behavior" + ) from openvino_genai import TokenizedInputs import openvino as ov @@ -460,7 +465,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg batch_size=args['batch_size'], prompt_idx=prompt_index ) - if num > 0: + if num > 0 and not enable_prompt_permutations: prev_md5 = md5_list[num - 1][prompt_index] if result_md5_list != prev_md5: log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py index a5fb0ecc0c..4eb76bef99 100644 --- a/tools/llm_bench/task/visual_language_generation.py +++ b/tools/llm_bench/task/visual_language_generation.py @@ -44,7 +44,7 @@ def run_visual_language_generation_optimum( for bs_index, in_text in enumerate(prompts): llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) tok_encode_start = time.perf_counter() - input_data = model.preprocess_inputs(text=prompts[0], image=images[0], **processor) + input_data = model.preprocess_inputs(text=prompts[0], image=images[0] if images else None, **processor) tok_encode_end = time.perf_counter() tok_encode_time = (tok_encode_end - tok_encode_start) * 1000 # Remove `token_type_ids` from inputs @@ -211,8 +211,11 @@ def run_visual_language_generation_genai( gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] gen_config.do_sample = False + kwargs = {} + if len(images) >= 1: + kwargs["images"] = images[0] start = time.perf_counter() - generation_result = model.generate(prompts[0], images=images[0], generation_config=gen_config) + generation_result = model.generate(prompts[0], generation_config=gen_config) end = time.perf_counter() generated_text = generation_result.texts perf_metrics = generation_result.perf_metrics From 017b06c2fde2d6e235640d56d707cf8ecd1ccae7 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Mon, 20 Jan 2025 22:19:45 +0400 Subject: [PATCH 3/4] Update tools/llm_bench/task/visual_language_generation.py --- tools/llm_bench/task/visual_language_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py index 4eb76bef99..a02b16b2bb 100644 --- a/tools/llm_bench/task/visual_language_generation.py +++ b/tools/llm_bench/task/visual_language_generation.py @@ -215,7 +215,7 @@ def run_visual_language_generation_genai( if len(images) >= 1: kwargs["images"] = images[0] start = time.perf_counter() - generation_result = model.generate(prompts[0], generation_config=gen_config) + generation_result = model.generate(prompts[0], generation_config=gen_config, **kwargs) end = time.perf_counter() generated_text = generation_result.texts perf_metrics = generation_result.perf_metrics From 115409d540eee5d2be823ceaa289353a7a9cb9ea Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Mon, 20 Jan 2025 22:23:02 +0400 Subject: [PATCH 4/4] Update tools/llm_bench/benchmark.py --- tools/llm_bench/benchmark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py index 25460599d8..5d4a1436a7 100644 --- a/tools/llm_bench/benchmark.py +++ b/tools/llm_bench/benchmark.py @@ -162,7 +162,6 @@ def get_argprser(): parser.add_argument("--height", type=int, required=False, help="Generated image height. 
From 115409d540eee5d2be823ceaa289353a7a9cb9ea Mon Sep 17 00:00:00 2001
From: Ekaterina Aidova
Date: Mon, 20 Jan 2025 22:23:02 +0400
Subject: [PATCH 4/4] Update tools/llm_bench/benchmark.py

---
 tools/llm_bench/benchmark.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py
index 25460599d8..5d4a1436a7 100644
--- a/tools/llm_bench/benchmark.py
+++ b/tools/llm_bench/benchmark.py
@@ -162,7 +162,6 @@ def get_argprser():
     parser.add_argument("--height", type=int, required=False, help="Generated image height. Applicable only for Image Generation.")
     parser.add_argument("--width", type=int, required=False, help="Generated image width. Applicable only for Image Generation.")
     parser.add_argument("--disable_prompt_permutation", action="store_true", help="Disable prompt modification from run to run to avoid prefix caching")
-
     return parser.parse_args()
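
With the series applied, prompt permutation is enabled by default and the new flag switches it off for measurements where prefix-cache reuse is intended. A hypothetical invocation (the model path is a placeholder; -m, -d, and -n are llm_bench's existing model, device, and iteration-count options):

    python tools/llm_bench/benchmark.py -m ./models/llama-2-7b-ov -d CPU -n 3 --disable_prompt_permutation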