From 39e9eb91aa2459b94c64be2488da82c0a52cbc64 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Wed, 31 Jan 2024 15:49:21 +0000
Subject: [PATCH] Del PackedString Representation for Sentencepiece

---
 .../tokenizer/sentence_piece.cpp             | 39 ++-----------------
 .../tokenizer/tensorflow_translators.cpp     | 31 ++-------------
 2 files changed, 7 insertions(+), 63 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp
index f6f75ae95..ab3de7121 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp
@@ -91,27 +91,14 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const s
 }
 
 void SentencepieceTokenizer::validate_and_infer_types() {
-
-    #if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS
-
-    FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, sybols)");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");
-
-    #else
-
     FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences");
     FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
     FRONT_END_GENERAL_CHECK(
-        // WA: sometimes f32 appeared as a placeholder for unknown type
-        get_input_element_type(1) == element::u8 || get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
+        // WA: f32 appeared as a placeholder for unknown type during intermediate conversion steps
+        get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
         "SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");
 
-    #endif
-
 
     // The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values
     // and dense shape
     set_output_type(0, element::i64, PartialShape{ Dimension(), Dimension(2) });
@@ -133,16 +120,6 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
     std::vector<int32_t> sparse_values;
     std::vector<int64_t> sparse_dense_shape;
 
-#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS
-
-    auto begin_ids = inputs[1].data<const int32_t>();
-    auto end_ids = inputs[2].data<const int32_t>();
-    auto data = inputs[3].data<const uint8_t>();
-
-    auto batch_size = shape_size(inputs[1].get_shape());
-
-#else
-
     auto input_element_type = get_input_element_type(1);
     int32_t batch_size;
 
@@ -157,24 +134,14 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
 
     if(input_element_type == ov::element::string) {
        strings = inputs[1].data<const std::string>();
        batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
-    } else if(input_element_type == ov::element::u8) {
-        parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data);
    } else {
        OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
    }
 
-#endif
-
     size_t max_token_id = 0;
     for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) {
         absl::string_view sentence;
-        if(input_element_type == ov::element::string) {
-            sentence = strings[batch_ind];
-        } else if(input_element_type == ov::element::u8) {
-            auto begin_ind = begin_ids[batch_ind];
-            auto end_ind = end_ids[batch_ind];
-            sentence = absl::string_view((const char*)data + begin_ind, end_ind - begin_ind);
-        }
+        sentence = strings[batch_ind];
         std::vector<int32_t> ids;
         CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids));
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp
index b74293018..2c5e99108 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp
@@ -39,35 +39,14 @@ namespace {
 
 OutputVector translate_sentencepiece_op(const NodeContext& node) {
     // extract model to configure SentencePieceTokenizer
-//    std::cout << "[ Trace 1 ] Before" << std::endl;
-//    auto sp_model_ov_any = node.get_attribute_as_any("model");
-//    std::cout << "[ Trace 1 ] Get Model" << std::endl;
-//    FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
-//        "SentencePieceOp configuration model is in incorrect format");
-    std::ifstream input( "/home/apaniuko/.config/JetBrains/RemoteDev-PY/_home_apaniuko_python_openvino_contrib/scratches/bytes", std::ios::binary );
-    std::vector<char> str_spm_model(std::istreambuf_iterator<char>(input), {});
-    std::cout << "[ Trace 1 ] FE Check" << std::endl;
-
-//    auto str_spm_model = sp_model_ov_any.as<std::vector<uint8_t>>();
-    std::cout << "[ Trace 1 ] As string" << std::endl;
-//    str_spm_model = str_spm_model.substr(2);
-//    str_spm_model = str_spm_model.substr(0, str_spm_model.size() - 1);
+    auto sp_model_ov_any = node.get_attribute_as_any("model");
+    FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
+        "SentencePieceOp configuration model is in incorrect format");
+    auto str_spm_model = sp_model_ov_any.as<std::string>();
     auto sp_model_const = std::make_shared<Constant>(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data());
-//    std::cout << "[ Trace 1 ] Successful size:"<< str_spm_model.size() << "\n" << str_spm_model.substr(0, 100) << std::endl;
-    std::cout << "[ Trace 1 ] Successful" << std::endl;
     return { sp_model_const };
 }
 
-//OutputVector translate_sentencepiece_op(const NodeContext& node) {
-//    // extract model to configure SentencePieceTokenizer
-//    auto sp_model_ov_any = node.get_attribute_as_any("model");
-//    FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
-//        "SentencePieceOp configuration model is in incorrect format");
-//    auto str_spm_model = sp_model_ov_any.as<std::string>();
-//    auto sp_model_const = std::make_shared<Constant>(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data());
-//    return { sp_model_const };
-//}
-
 NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
     // this is custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer,
     // and RaggedTensorToSparse operation- into a custom operation SentencepieceTokenizerExtensionOp
@@ -86,8 +65,6 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
 
     // prepare input
     auto inputs = sp_tokenize_op->input_value(1);
-    auto parameter = std::dynamic_pointer_cast<Parameter>(inputs.get_node_shared_ptr());
-    parameter -> set_partial_shape(PartialShape{ Dimension() });
 
     // extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes
     auto nbest_size = extract_scalar_const_value<int32_t>(sp_tokenize_op->input_value(2).get_node_shared_ptr(), "nbest_size");
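
Below is a minimal caller-side sketch (illustrative only, not part of the patch) of how sentences reach the tokenizer after this change: as a native element::string tensor instead of the packed u8 begins/ends/symbols triplet. It assumes an OpenVINO build with string-tensor support; the extension library name and model path are placeholders.

// Illustrative sketch; placeholder paths, not part of the patch.
#include <openvino/openvino.hpp>
#include <iostream>

int main() {
    ov::Core core;
    // The SentencepieceTokenizer custom op lives in the user extensions library.
    core.add_extension("libuser_ov_extensions.so");

    // A tokenizer model converted with the translators touched by this patch.
    auto compiled = core.compile_model("sentencepiece_tokenizer.xml", "CPU");
    auto request = compiled.create_infer_request();

    // Sentences are passed as a string tensor; the packed u8 representation
    // (begins/ends/symbols) is no longer accepted by the op.
    ov::Tensor sentences(ov::element::string, ov::Shape{2});
    sentences.data<std::string>()[0] = "Hello world";
    sentences.data<std::string>()[1] = "SentencePiece test";

    request.set_input_tensor(sentences);
    request.infer();

    // Output 0: sparse indices (i64), output 1: token ids (i32), output 2: dense shape (i64).
    auto token_ids = request.get_output_tensor(1);
    std::cout << "produced " << token_ids.get_size() << " tokens" << std::endl;
    return 0;
}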