Commit

[llm_bench] enable prompt permutations to prevent prefix caching and fix vlm image load (#1607)

CVS-160892
eaidova authored Jan 21, 2025
1 parent 2da00a0 commit e0488c8
Showing 4 changed files with 38 additions and 7 deletions.
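Background: runtimes with prefix caching reuse KV-cache entries for requests that share a leading token sequence, so benchmarking an identical prompt on every iteration would understate real prefill cost. The commit therefore varies the first token of the tokenized prompt per iteration. A minimal standalone illustration of the idea (hypothetical token ids, numpy as a stand-in for the real tensors):

import numpy as np

# Hypothetical tokenized prompt, batch of 1; the ids are illustrative only.
input_ids = np.array([[15, 402, 7, 99, 1024]])

for num in range(3):  # benchmark iterations
    permuted = input_ids.copy()
    # Iteration-dependent first token: each run now presents a unique
    # prefix, so a prefix cache can never serve it from a previous run.
    permuted[:, 0] = num + 1
    print(num, permuted[0].tolist())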
1 change: 1 addition & 0 deletions tools/llm_bench/benchmark.py
@@ -161,6 +161,7 @@ def get_argprser():
     parser.add_argument("--num_steps", type=int, required=False, help="Number of inference steps for image generation")
     parser.add_argument("--height", type=int, required=False, help="Generated image height. Applicable only for Image Generation.")
     parser.add_argument("--width", type=int, required=False, help="Generated image width. Applicable only for Image Generation.")
+    parser.add_argument("--disable_prompt_permutation", action="store_true", help="Disable prompt modification from run to run to avoid prefix caching")
     return parser.parse_args()


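For reference, the new option is a plain store_true switch, so permutations stay enabled unless it is passed explicitly. A self-contained sketch of that behavior (not the full benchmark CLI):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--disable_prompt_permutation", action="store_true",
                    help="Disable prompt modification from run to run to avoid prefix caching")

# Default: flag absent, permutations remain enabled.
assert parser.parse_args([]).disable_prompt_permutation is False
# Passing the flag opts out of permutations.
assert parser.parse_args(["--disable_prompt_permutation"]).disable_prompt_permutation is True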
7 changes: 4 additions & 3 deletions tools/llm_bench/llm_bench_utils/model_utils.py
@@ -37,12 +37,12 @@ def get_param_from_file(args, input_key):
         if args["use_case"] != "vlm":
             raise RuntimeError("Multiple sources for benchmarking supported only for Visual Language Models")
         data_dict = {}
-        if args["media"] is None:
+        if args["media"] is None and args["images"] is None:
             log.warn("Input image is not provided. Only text generation part will be evaluated")
         else:
-            data_dict["media"] = args["media"]
+            data_dict["media"] = args["media"] if args["media"] is not None else args["images"]
         if args["prompt"] is None:
-            data_dict["prompt"] = "What is OpenVINO?" if args["media"] is None else "Describe image"
+            data_dict["prompt"] = "What is OpenVINO?" if data_dict.get("media") is None else "Describe image"
         else:
             data_dict["prompt"] = args["prompt"]
         data_list.append(data_dict)
@@ -113,6 +113,7 @@ def analyze_args(args):
     model_args['torch_compile_options'] = args.torch_compile_options
     model_args['torch_compile_input_module'] = args.torch_compile_input_module
     model_args['media'] = args.media
+    model_args["disable_prompt_permutation"] = args.disable_prompt_permutation
 
     optimum = args.optimum
 
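The first hunk above changes the input-resolution order for VLM benchmarking: --media wins, then --images, and the default prompt depends on whether any image source was found. A condensed sketch of that logic (the resolve_media_and_prompt helper is hypothetical, not part of the repo):

def resolve_media_and_prompt(args):
    # --media takes precedence; fall back to --images if it is unset.
    media = args.get("media") if args.get("media") is not None else args.get("images")
    prompt = args.get("prompt")
    if prompt is None:
        # Text-only runs get a generic question; image runs get a caption request.
        prompt = "What is OpenVINO?" if media is None else "Describe image"
    return {"media": media, "prompt": prompt}

print(resolve_media_and_prompt({"media": None, "images": "cat.png", "prompt": None}))
# -> {'media': 'cat.png', 'prompt': 'Describe image'}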
30 changes: 28 additions & 2 deletions tools/llm_bench/task/text_generation.py
@@ -207,6 +207,19 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     tokenization_end = time.perf_counter()
     tokenization_time = [(tokenization_end - tokenization_start) * 1000]
 
+    enable_prompt_permutations = not args.get("disable_prompt_permutation", False)
+    if enable_prompt_permutations:
+        log.warning(
+            "Enabled input prompt permutations. Generation results may vary from run to run. "
+            "If this is not expected, specify --disable_prompt_permutation in your benchmarking command to disable this behavior"
+        )
+        from openvino_genai import TokenizedInputs
+        import openvino as ov
+
+        input_ids = input_data.input_ids.data
+        input_ids[:, 0] = num + 1
+        attention_mask = input_data.attention_mask
+        input_data = TokenizedInputs(input_ids=ov.Tensor(input_ids), attention_mask=attention_mask)
     num_input_tokens = input_data.input_ids.shape[1]
     if args['batch_size'] > 1:
         out_str = '[warm-up]' if num == 0 else '[{}]'.format(num)
@@ -325,7 +338,7 @@ def token_printer():
         batch_size=args['batch_size'],
         prompt_idx=prompt_index
     )
-    if num > 0:
+    if num > 0 and not enable_prompt_permutations:
         prev_md5 = md5_list[num - 1][prompt_index]
         if result_md5_list != prev_md5:
             log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} "
@@ -366,6 +379,19 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
     gen_config.max_new_tokens = max_gen_tokens
     gen_config.num_beams = args["num_beams"]
     gen_config.do_sample = False
+    enable_prompt_permutations = not args.get("disable_prompt_permutation", False)
+    if enable_prompt_permutations:
+        log.warning(
+            "Enabled input prompt permutations. Generation results may vary from run to run. "
+            "If this is not expected, specify --disable_prompt_permutation in your benchmarking command to disable this behavior"
+        )
+        from openvino_genai import TokenizedInputs
+        import openvino as ov
+
+        input_ids = input_data.input_ids.data
+        input_ids[:, 0] = num + 1
+        attention_mask = input_data.attention_mask
+        input_data = TokenizedInputs(input_ids=ov.Tensor(input_ids), attention_mask=attention_mask)
     if args.get('draft_model', ''):
         config_info = "Speculative decoding config: "
         if args.get("num_assistant_tokens", None):
@@ -439,7 +465,7 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
         batch_size=args['batch_size'],
         prompt_idx=prompt_index
     )
-    if num > 0:
+    if num > 0 and not enable_prompt_permutations:
         prev_md5 = md5_list[num - 1][prompt_index]
         if result_md5_list != prev_md5:
             log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} "
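The permutation block is duplicated verbatim in run_text_generation_genai and run_text_generation_genai_with_stream, so it could be factored into a single helper. A sketch using the same openvino_genai API as the diff (the permute_prompt name is hypothetical):

import openvino as ov
from openvino_genai import TokenizedInputs

def permute_prompt(input_data, num):
    # Overwrite the first token with an iteration-dependent id so the
    # tokenized prefix differs on every benchmark run.
    input_ids = input_data.input_ids.data
    input_ids[:, 0] = num + 1
    return TokenizedInputs(input_ids=ov.Tensor(input_ids),
                           attention_mask=input_data.attention_mask)

Because the first token is rewritten, outputs legitimately differ across iterations, which is why both md5 consistency checks above are skipped while permutations are enabled.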
7 changes: 5 additions & 2 deletions tools/llm_bench/task/visual_language_generation.py
@@ -44,7 +44,7 @@ def run_visual_language_generation_optimum(
     for bs_index, in_text in enumerate(prompts):
         llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
     tok_encode_start = time.perf_counter()
-    input_data = model.preprocess_inputs(text=prompts[0], image=images[0], **processor)
+    input_data = model.preprocess_inputs(text=prompts[0], image=images[0] if images else None, **processor)
     tok_encode_end = time.perf_counter()
     tok_encode_time = (tok_encode_end - tok_encode_start) * 1000
     # Remove `token_type_ids` from inputs
@@ -211,8 +211,11 @@ def run_visual_language_generation_genai(
     gen_config.max_new_tokens = max_gen_tokens
     gen_config.num_beams = args["num_beams"]
     gen_config.do_sample = False
+    kwargs = {}
+    if len(images) >= 1:
+        kwargs["images"] = images[0]
     start = time.perf_counter()
-    generation_result = model.generate(prompts[0], images=images[0], generation_config=gen_config)
+    generation_result = model.generate(prompts[0], generation_config=gen_config, **kwargs)
     end = time.perf_counter()
     generated_text = generation_result.texts
     perf_metrics = generation_result.perf_metrics
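The genai VLM fix passes the image through **kwargs only when one exists: the old call indexed images[0] unconditionally and would fail for text-only runs, while the new call omits the images keyword entirely rather than sending images=None. The pattern in isolation (the generate_vlm wrapper is hypothetical):

def generate_vlm(model, prompt, images, gen_config):
    # Build kwargs conditionally: no "images" key at all for text-only prompts.
    kwargs = {"images": images[0]} if images else {}
    return model.generate(prompt, generation_config=gen_config, **kwargs)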
