From 0aea7f9d2a87a11974a5c0479fbe4460f121c9c5 Mon Sep 17 00:00:00 2001
From: Ekaterina Shiryaeva
Date: Tue, 29 Oct 2024 19:56:20 +0000
Subject: [PATCH] Add model modifications for StaticWhisperPipeline

---
 src/cpp/src/whisper_pipeline_static.cpp | 201 ++++++++++++++++++++++--
 1 file changed, 186 insertions(+), 15 deletions(-)

diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp
index 8a2abf7ad1..825a3ea65c 100644
--- a/src/cpp/src/whisper_pipeline_static.cpp
+++ b/src/cpp/src/whisper_pipeline_static.cpp
@@ -15,7 +15,8 @@
 
 #include "whisper/whisper.hpp"
 #include "whisper/whisper_config.hpp"
-
+#include "openvino/core/layout.hpp"
+#include "openvino/core/preprocess/pre_post_process.hpp"
 #include "openvino/pass/pattern/matcher.hpp"
 #include "openvino/pass/pattern/op/wrap_type.hpp"
 #include "openvino/pass/graph_rewrite.hpp"
@@ -68,8 +69,9 @@ ov::Tensor make_tensor_slice(ov::Tensor tensor, size_t dim, size_t start_pos, si
 
 void set_cross_attn_key_value(ov::InferRequest& source, ov::InferRequest& dest) {
     // NB: Source outputs:
-    //   present_key_values.0.encoder.key
-    //   present_key_values.0.encoder.value
+    // for optimum-cli
+    //   present.0.encoder.key
+    //   present.0.encoder.value
 
     // NB: Dest inputs:
     //   past_key_values.0.encoder.key
@@ -80,15 +82,16 @@ void set_cross_attn_key_value(ov::InferRequest& source, ov::InferRequest& dest)
         if (source_output_name.find("encoder") == std::string::npos) {
             continue;
         }
-        std::string with_past_input_name = std::regex_replace(source_output_name, std::regex("present"), "past");
+        std::string with_past_input_name = std::regex_replace(source_output_name, std::regex("present"), "past_key_values");
         dest.set_tensor(with_past_input_name, source.get_tensor(source_output_name));
     }
 }
 
 void update_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, const size_t kv_pos = 0u) {
     // NB: Source outputs:
-    //   present_key_values.0.decoder.key
-    //   present_key_values.0.decoder.value
+    // for optimum-cli
+    //   present.0.decoder.key
+    //   present.0.decoder.value
 
     // NB: Dest inputs:
     //   past_key_values.0.decoder.key
@@ -100,7 +103,7 @@ void update_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, con
             continue;
         }
 
-        std::string with_past_input_name = std::regex_replace(source_output_name, std::regex("present"), "past");
+        std::string with_past_input_name = std::regex_replace(source_output_name, std::regex("present"), "past_key_values");
 
         auto src_kv_tensor = source.get_tensor(source_output_name);
         auto dst_kv_tensor = dest.get_tensor(with_past_input_name);
@@ -133,6 +136,9 @@ void set_decoder_input_ids_attention_mask(ov::InferRequest& decoder,
     auto attention_mask_data = attention_mask_tensor.data<ov::float16>();
     std::fill_n(attention_mask_data, init_ids.size(), 1u);
     std::fill(attention_mask_data + init_ids.size(), attention_mask_data + attention_mask_tensor.get_size(), 0u);
+
+    //decoder.get_tensor("attention_mask").data<ov::float16>()[input_ids.size() - 1] = 0u;
+    // ^ Need to use the attention_mask size here!
 }
 
 int64_t decode(ov::Tensor& encoder_hidden_state,
@@ -171,7 +177,8 @@ int64_t decode_with_past(ov::InferRequest& decoder_with_past,
     // FIXME: Avoid this cast to i32. Why it's not i64 precision in model?
     decoder_with_past.get_tensor("input_ids").data<int32_t>()[0] = static_cast<int32_t>(input_id);
     // FIXME: Avoid this cast to i32. Why it's not i64 precision in model?
-    decoder_with_past.get_tensor("position_ids").data<int32_t>()[0] = static_cast<int32_t>(position_id);
+    //decoder_with_past.get_tensor("position_ids").data<int32_t>()[0] = static_cast<int32_t>(position_id);
+    decoder_with_past.get_tensor("cache_position").data<int64_t>()[0] = position_id;  // for optimum-cli
     // FIXME: Is "attention_mask" supposed to be f16?
     decoder_with_past.get_tensor("attention_mask").data<ov::float16>()[position_id - 1] = 1u;
 
@@ -195,7 +202,7 @@ void zero_past_key_values(ov::InferRequest& request) {
             past_key_value_decoder_name.find("past_key_values") == std::string::npos) {
             continue;
         }
-        fill_tensor(request.get_tensor(past_key_value_decoder_name), 0);
+        fill_tensor(request.get_tensor(past_key_value_decoder_name), 0);  // for optimum-cli
     }
 }
 
@@ -204,8 +211,12 @@ void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferReq
     auto attention_mask = decoder_with_past.get_tensor("attention_mask");
     auto* attention_mask_ptr = attention_mask.data<ov::float16>();
     std::fill(attention_mask_ptr, attention_mask_ptr + 3u, 1);
-    std::fill(attention_mask_ptr + 3u, attention_mask_ptr + attention_mask.get_size() - 1, 0);
-    attention_mask_ptr[attention_mask.get_size() - 1] = 1;
+    //std::fill(attention_mask_ptr + 3u, attention_mask_ptr + attention_mask.get_size() - 1, 0);
+    //attention_mask_ptr[attention_mask.get_size() - 1] = 1;
+    // NB: for optimum-cli models attention_mask should be [1, 1, 1, 0, 0, 0, 0, ..., 1, 0], mask size = kvcache_size + 1 (FIXME)
+    std::fill(attention_mask_ptr + 3u, attention_mask_ptr + attention_mask.get_size() - 2, 0);
+    attention_mask_ptr[attention_mask.get_size() - 2] = 1;
+    attention_mask_ptr[attention_mask.get_size() - 1] = 0;
     // NB: Zero past_key_values.*.decoder.value tensors
     zero_past_key_values(decoder_with_past);
     // NB: Copy KV-caches from decoder
@@ -395,6 +406,128 @@ void add_attention_mask_input(std::shared_ptr<ov::Model> model) {
     pm.run_passes(model);
 }
 
+void reshape_to_static(std::shared_ptr<ov::Model> model, const uint32_t input_size, const uint32_t kvcache_size) {
+    //std::cout << "[DEBUG] Reshaping decoder_with_past_model ..." << std::endl;
+
+    std::map<std::string, ov::PartialShape> new_shapes;
+    for (auto input : model->inputs()) {
+        const auto& input_name = input.get_any_name();
+        ov::PartialShape new_shape;
+        if (input_name.find("input_ids") != std::string::npos) {
+            new_shape = ov::PartialShape({1, input_size});
+        } else if (input_name.find("attention_mask") != std::string::npos) {
+            new_shape = ov::PartialShape({1, kvcache_size + 1});  // Artefact in attention_mask
+        } else if (input_name.find("position_ids") != std::string::npos) {
+            new_shape = ov::PartialShape({1, input_size});
+        } else if (input_name.find("cache_position") != std::string::npos) {
+            new_shape = ov::PartialShape({1});
+        } else if (input_name.find("encoder_hidden_states") != std::string::npos) {
+            const auto& partial_shape = input.get_partial_shape();
+            new_shape = partial_shape;
+            new_shape[0] = 1;     // batch_dim
+            new_shape[1] = 1500;  // FIXME: where to get this? Is it taken from the encoder output 'last_hidden_state'?
+        } else if (input_name.find("past_key_values") != std::string::npos) {
+            const auto& partial_shape = input.get_partial_shape();
+            new_shape = partial_shape;
+            new_shape[0] = 1;  // Use batch dim here
+            new_shape[2] = input_name.find(".decoder") != std::string::npos
+                               ? kvcache_size - input_size
+                               : 1500;  // kv_size for decoder, 1500 for encoder: is it taken from the encoder
+                                        // output 'last_hidden_state'?
+
+            // ^ use kv_dim here
+        }
+        new_shapes.emplace(input_name, new_shape);
+    }
+
+    model->reshape(new_shapes);
+}
+
+void reshape_to_static_encoder(std::shared_ptr<ov::Model> model) {
+    std::map<std::string, ov::PartialShape> new_shapes;
+    for (auto input : model->inputs()) {
+        const auto& input_name = input.get_any_name();
+        ov::PartialShape new_shape;
+        if (input_name.find("input_features") != std::string::npos) {
+            const auto& partial_shape = input.get_partial_shape();
+            new_shape = partial_shape;
+            new_shape[0] = 1;  // batch_dim
+        }
+        new_shapes.emplace(input_name, new_shape);
+    }
+    model->reshape(new_shapes);
+}
+
+void preprocess_encoder(std::shared_ptr<ov::Model> model) {
+    ov::preprocess::PrePostProcessor preprocessor(model);
+
+    preprocessor.input("input_features").tensor().set_element_type(ov::element::Type_t::f32);
+    preprocessor.input("input_features").preprocess().convert_element_type(ov::element::Type_t::f32);
+    preprocessor.output("last_hidden_state").tensor().set_element_type(ov::element::Type_t::f16);
+
+    model = preprocessor.build();
+}
+
+void preprocess_decoder(std::shared_ptr<ov::Model> model) {
+    ov::preprocess::PrePostProcessor preprocessor(model);
+
+    for (auto tensor : model->inputs()) {
+        if (tensor.get_any_name().find("input_ids") != std::string::npos) {
+            preprocessor.input("input_ids").tensor().set_element_type(ov::element::Type_t::i32);
+            preprocessor.input("input_ids").preprocess().convert_element_type(ov::element::Type_t::i32);
+        } else if (tensor.get_any_name().find("attention_mask") != std::string::npos) {
+            preprocessor.input("attention_mask").tensor().set_element_type(ov::element::Type_t::f16);
+            preprocessor.input("attention_mask").preprocess().convert_element_type();
+        } else if (tensor.get_any_name().find("encoder_hidden_states") != std::string::npos) {
+            preprocessor.input("encoder_hidden_states").tensor().set_element_type(ov::element::Type_t::f16);
+            preprocessor.input("encoder_hidden_states").preprocess().convert_element_type(ov::element::Type_t::f32);  // ()
+        } else if (tensor.get_any_name().find("past_key_values") != std::string::npos) {
+            preprocessor.input(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
+            preprocessor.input(tensor.get_any_name()).preprocess().convert_element_type();
+
+            // if (tensor.get_any_name().find(".value") != std::string::npos) {
+            //     preprocessor.output(tensor.get_any_name()).tensor().set_layout(ov::Layout("NCWH"));
+            //     preprocessor.output(tensor.get_any_name()).model().set_layout(ov::Layout("NCHW"));
+            //} else if (tensor.get_any_name().find(".key") != std::string::npos) {
+            //     preprocessor.output(tensor.get_any_name()).tensor().set_layout(ov::Layout("NCHW"));
+            //     preprocessor.output(tensor.get_any_name()).model().set_layout(ov::Layout("NCHW"));
+            //}
+        }
+    }
+
+    for (auto tensor : model->outputs()) {
+        //preprocessor.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
+        if (tensor.get_any_name().find("present") != std::string::npos) {  // "present" for models from arch team
+            preprocessor.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
+            preprocessor.output(tensor.get_any_name()).postprocess().convert_element_type();
+
+            // if (tensor.get_any_name().find(".value") != std::string::npos) {
+            //     preprocessor.output(tensor.get_any_name()).tensor().set_layout(ov::Layout("NCWH"));
+            //     preprocessor.output(tensor.get_any_name()).model().set_layout(ov::Layout("NCHW"));
+            //} else if (tensor.get_any_name().find(".key") != std::string::npos) {
+            //     preprocessor.output(tensor.get_any_name()).tensor().set_layout(ov::Layout("NCHW"));
+            //     preprocessor.output(tensor.get_any_name()).model().set_layout(ov::Layout("NCHW"));
+            //}
+        }
+    }
+
+    model = preprocessor.build();
+}
+
+std::shared_ptr<ov::Model> redirect_new_kv_to_output(const std::shared_ptr<ov::Model>& model) {
+    const auto kStartOutputKVCacheLayers = 1u;
+    for (int i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) {
+        auto kvout = model->output(i);
+        auto kvrslt = kvout.get_node();
+        auto kvcat = kvrslt->inputs()[0].get_source_output().get_node();
+        auto kvval = kvcat->inputs()[1].get_source_output();
+        kvval.set_names({kvout.get_any_name()});
+        kvrslt->inputs()[0].replace_source_output(kvval);
+    }
+    model->validate_nodes_and_infer_types();
+    return model;
+}
+
 }  // namespace
 
 namespace ov {
@@ -418,10 +551,48 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys
     }
 
     // TODO: There must be model reshape to eliminate dynamism!
-
-    m_models.encoder = core.compile_model(encoder_model, "NPU").create_infer_request();
-    m_models.decoder = core.compile_model(decoder_model, "NPU").create_infer_request();
-    m_models.decoder_with_past = core.compile_model(decoder_with_past_model, "NPU").create_infer_request();
+    size_t max_sequence_length = 128;
+
+    reshape_to_static_encoder(encoder_model);
+    reshape_to_static(decoder_model, 4, 4);  // What is 4 here??
+    reshape_to_static(decoder_with_past_model, 1, max_sequence_length);
+
+    // Replace KV-tensors for the entire cache with tensors for the new token only
+    decoder_with_past_model = redirect_new_kv_to_output(decoder_with_past_model);
+
+    ov::AnyMap config_encoder = {
+        {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"},
+        {"NPU_USE_NPUW", "YES"},
+        {"NPUW_ONLINE_PIPELINE", "NONE"},
+        //{"NPUW_FOLD", "YES"},
+        //{"NPUW_DCOFF_TYPE", "f16"},
+        //{"NPUW_DCOFF_SCALE", "YES"},
+        {"NPUW_DEVICES", "CPU"}};
+
+    ov::AnyMap config = {
+        {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add"},
+        {"NPU_USE_NPUW", "YES"},
+        //{"NPUW_FOLD", "YES"},
+        //{"NPUW_DCOFF_TYPE", "f16"},
+        //{"NPUW_DCOFF_SCALE", "YES"},
+        {"NPUW_DEVICES", "CPU"}};
+
+    preprocess_encoder(encoder_model);
+    preprocess_decoder(decoder_model);
+    preprocess_decoder(decoder_with_past_model);
+
+    std::cout << "[DEBUG] All model modifications are done, saving models..." << std::endl;
+    ov::save_model(encoder_model, models_path / "0_openvino_encoder_model_attn.xml");
+    ov::save_model(decoder_model, models_path / "0_openvino_decoder_model_attn.xml");
+    ov::save_model(decoder_with_past_model, models_path / "0_openvino_decoder_with_past_model_attn.xml");
+
+    m_models.encoder = core.compile_model(encoder_model, "NPU", config_encoder).create_infer_request();
+    std::cout << "[DEBUG] Compile encoder model - DONE" << std::endl;
+    m_models.decoder = core.compile_model(decoder_model, "NPU", config_encoder).create_infer_request();
+    std::cout << "[DEBUG] Compile decoder model - DONE" << std::endl;
+    m_models.decoder_with_past =
+        core.compile_model(decoder_with_past_model, "NPU", config_encoder).create_infer_request();
+    std::cout << "[DEBUG] Compile decoder with past model - DONE" << std::endl;
 
     // If eos_token_id was not provided, take value
     if (m_generation_config.eos_token_id == -1) {
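
Note on the present/past_key_values renaming in set_cross_attn_key_value() and update_past_key_value(): optimum-cli exported decoders name their KV outputs present.<layer>.<decoder|encoder>.<key|value>, while the matching inputs are past_key_values.<layer>.<decoder|encoder>.<key|value>, which is why the regex replacement target changes from "past" to "past_key_values". A standalone sanity check of that mapping (illustration only, not part of the patch):

    #include <cassert>
    #include <regex>
    #include <string>

    int main() {
        const std::string out_name = "present.0.decoder.key";
        // Same substitution the patch performs to find the matching input tensor.
        const std::string in_name = std::regex_replace(out_name, std::regex("present"), "past_key_values");
        assert(in_name == "past_key_values.0.decoder.key");
        return 0;
    }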
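Note on the attention-mask layout in prepare_decoder_with_past() and decode_with_past(): for optimum-cli models the mask is one element longer than the KV cache (kvcache_size + 1, as set in reshape_to_static), starts as [1, 1, 1, 0, ..., 0, 1, 0], and one extra position is switched on per generated token. A minimal sketch of that indexing using a plain std::vector in place of the f16 tensor (the init-token count of 3 mirrors the fill in the patch; this is an illustration, not pipeline code):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Builds the initial mask shape described in the NB comment of prepare_decoder_with_past().
    std::vector<float> make_initial_mask(std::size_t kvcache_size, std::size_t n_init_ids = 3) {
        std::vector<float> mask(kvcache_size + 1, 0.0f);
        std::fill(mask.begin(), mask.begin() + n_init_ids, 1.0f);  // init tokens
        mask[mask.size() - 2] = 1.0f;  // second-to-last element is 1
        mask[mask.size() - 1] = 0.0f;  // trailing artefact element stays 0
        return mask;
    }

    // decode_with_past() then opens one more position per generated token.
    void advance_mask(std::vector<float>& mask, std::size_t position_id) {
        mask[position_id - 1] = 1.0f;
    }

    int main() {
        auto mask = make_initial_mask(8);
        assert((mask == std::vector<float>{1, 1, 1, 0, 0, 0, 0, 1, 0}));
        advance_mask(mask, 4);
        assert(mask[3] == 1.0f);
        return 0;
    }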
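Note on the hard-coded 1500 in reshape_to_static(): it is the encoder output length for Whisper's fixed 30-second window (3000 mel frames halved by the stride-2 convolution), i.e. the second dimension of last_hidden_state. A hypothetical helper that reads it from the encoder model instead of hard-coding it, assuming that dimension is already static at this point:

    #include <cstdint>
    #include <memory>

    #include "openvino/openvino.hpp"

    // Hypothetical helper: take the encoder output length from the model itself.
    uint32_t encoder_hidden_length(const std::shared_ptr<ov::Model>& encoder_model) {
        // last_hidden_state has shape [batch, frames, hidden]; frames is 1500 for the 30 s window.
        const auto& shape = encoder_model->output("last_hidden_state").get_partial_shape();
        return static_cast<uint32_t>(shape[1].get_length());
    }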
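Note on redirect_new_kv_to_output(): it re-wires each present.* Result node from the full concatenated KV cache to the second input of the Concat, so the decoder-with-past model returns only the new token's KV slice, which update_past_key_value() then copies into the correct offset of the past tensors. A small inspection sketch (assuming the optimum-cli present.* naming and the KV sequence axis 2 used by reshape_to_static) to check that the rewritten outputs carry a sequence dimension of 1:

    #include <iostream>
    #include <memory>

    #include "openvino/openvino.hpp"

    // Print the KV output shapes of the rewritten decoder_with_past model.
    // Expected after redirect_new_kv_to_output(): a sequence dimension of 1 on axis 2.
    void dump_kv_output_shapes(const std::shared_ptr<ov::Model>& model) {
        for (const auto& output : model->outputs()) {
            const auto& name = output.get_any_name();
            if (name.find("present") == std::string::npos) {
                continue;  // e.g. logits
            }
            std::cout << name << " : " << output.get_partial_shape() << std::endl;
        }
    }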