diff --git a/tools/llm_bench/task/speech_to_text_generation.py b/tools/llm_bench/task/speech_to_text_generation.py
index f1e7ac54a0..15a47a8b6a 100644
--- a/tools/llm_bench/task/speech_to_text_generation.py
+++ b/tools/llm_bench/task/speech_to_text_generation.py
@@ -57,7 +57,7 @@ def run_speech_2_txt_generation(input_param, args, md5_list, iter_data_list):
             - np.array(perf_metrics.raw_metrics.m_new_token_times[:-1])
         ).tolist()
         tm_list = (np.array([first_token_time] + second_tokens_durations) / 1000).tolist()
-        tm_infer_list = None
+        tm_infer_list = (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist()
         result_text = result_text.texts[0]
     else:
         start = time.perf_counter()
diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index 5fbf950d2c..200a8da6f2 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -294,6 +294,7 @@ def token_printer():
         np.mean(perf_metrics.raw_metrics.tokenization_durations) / 1000,
         np.mean(perf_metrics.raw_metrics.detokenization_durations) / 1000
     )
+    inference_durations = np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000
     iter_data = gen_output_data.gen_iterate_data(
         iter_idx=num,
         in_size=num_input_tokens * args['batch_size'],
@@ -313,7 +314,7 @@ def token_printer():
         num,
         iter_data,
         tm_list.tolist(),
-        None,
+        inference_durations.tolist(),
         warm_up=(num == 0),
         max_rss_mem=max_rss_mem_consumption,
         max_shared_mem=max_shared_mem_consumption,
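Both hunks replace a `None` placeholder with real per-token inference timings, applying the same `/ 1000 / 1000` conversion already used for `tm_list` in text_generation.py. A minimal sketch of that conversion, assuming (as the existing conversions in these files suggest) that `raw_metrics.token_infer_durations` holds one raw duration per generated token in microseconds; `perf_metrics` here is a stand-in for the metrics object returned by generation:

import numpy as np

def infer_durations_seconds(perf_metrics):
    # Assumed units: token_infer_durations is in microseconds, so dividing
    # twice by 1000 yields seconds, matching the units tm_list is reported in.
    return (np.array(perf_metrics.raw_metrics.token_infer_durations) / 1000 / 1000).tolist()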