From 39e9eb91aa2459b94c64be2488da82c0a52cbc64 Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Wed, 31 Jan 2024 15:49:21 +0000
Subject: [PATCH] Del PackedString Representation for Sentencepiece

---
 .../tokenizer/sentence_piece.cpp             | 39 ++-----------------
 .../tokenizer/tensorflow_translators.cpp     | 31 ++-------------
 2 files changed, 7 insertions(+), 63 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp
index f6f75ae95..ab3de7121 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/sentence_piece.cpp
@@ -91,27 +91,14 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const s
 }
 
 void SentencepieceTokenizer::validate_and_infer_types() {
-
-    #if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS
-
-    FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, sybols)");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second and it should be of type i32 tensor");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third and it should be of type i32 tensor");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");
-
-    #else
-
     FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences");
     FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
     FRONT_END_GENERAL_CHECK(
-        // WA: sometimes f32 appeared as a placeholder for unknown type
-        get_input_element_type(1) == element::u8 || get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
+        // WA: f32 appeared as a placeholder for unknown type during intermediate conversion steps
+        get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
         "SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");
 
-    #endif
-
 
     // The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values
     // and dense shape
     set_output_type(0, element::i64, PartialShape{ Dimension(), Dimension(2) });
@@ -133,16 +120,6 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
     std::vector<int32_t> sparse_values;
     std::vector<int64_t> sparse_dense_shape;
 
-#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS
-
-    auto begin_ids = inputs[1].data<const int32_t>();
-    auto end_ids = inputs[2].data<const int32_t>();
-    auto data = inputs[3].data<const uint8_t>();
-
-    auto batch_size = shape_size(inputs[1].get_shape());
-
-#else
-
     auto input_element_type = get_input_element_type(1);
     int32_t batch_size;
 
@@ -157,24 +134,14 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
 
     if(input_element_type == ov::element::string) {
        strings = inputs[1].data<const std::string>();
        batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
-    } else if(input_element_type == ov::element::u8) {
-        parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data);
    } else {
        OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
    }
 
-#endif
-
     size_t max_token_id = 0;
     for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) {
         absl::string_view sentence;
-        if(input_element_type == ov::element::string) {
-            sentence = strings[batch_ind];
-        } else if(input_element_type == ov::element::u8) {
-            auto begin_ind = begin_ids[batch_ind];
-            auto end_ind = end_ids[batch_ind];
-            sentence = absl::string_view((const char*)data + begin_ind, end_ind - begin_ind);
-        }
+        sentence = strings[batch_ind];
         std::vector<int32_t> ids;
         CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids));
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp
index b74293018..2c5e99108 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/tensorflow_translators.cpp
@@ -39,35 +39,14 @@ namespace {
 
 OutputVector translate_sentencepiece_op(const NodeContext& node) {
     // extract model to configure SentencePieceTokenizer
-//    std::cout << "[ Trace 1 ] Before" << std::endl;
-//    auto sp_model_ov_any = node.get_attribute_as_any("model");
-//    std::cout << "[ Trace 1 ] Get Model" << std::endl;
-//    FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
-//        "SentencePieceOp configuration model is in incorrect format");
-    std::ifstream input( "/home/apaniuko/.config/JetBrains/RemoteDev-PY/_home_apaniuko_python_openvino_contrib/scratches/bytes", std::ios::binary );
-    std::vector<char> str_spm_model(std::istreambuf_iterator<char>(input), {});
-    std::cout << "[ Trace 1 ] FE Check" << std::endl;
-
-//    auto str_spm_model = sp_model_ov_any.as<std::vector<uint8_t>>();
-    std::cout << "[ Trace 1 ] As string" << std::endl;
-//    str_spm_model = str_spm_model.substr(2);
-//    str_spm_model = str_spm_model.substr(0, str_spm_model.size() - 1);
+    auto sp_model_ov_any = node.get_attribute_as_any("model");
+    FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
+        "SentencePieceOp configuration model is in incorrect format");
+    auto str_spm_model = sp_model_ov_any.as<std::string>();
     auto sp_model_const = std::make_shared<Constant>(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data());
-//    std::cout << "[ Trace 1 ] Successful size:"<< str_spm_model.size() << "\n" << str_spm_model.substr(0, 100) << std::endl;
-    std::cout << "[ Trace 1 ] Successful" << std::endl;
     return { sp_model_const };
 }
 
-//OutputVector translate_sentencepiece_op(const NodeContext& node) {
-//    // extract model to configure SentencePieceTokenizer
-//    auto sp_model_ov_any = node.get_attribute_as_any("model");
-//    FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
-//        "SentencePieceOp configuration model is in incorrect format");
-//    auto str_spm_model = sp_model_ov_any.as<std::string>();
-//    auto sp_model_const = std::make_shared<Constant>(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data());
-//    return { sp_model_const };
-//}
-
 NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
     // this is custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer,
     // and RaggedTensorToSparse operation- into a custom operation SentencepieceTokenizerExtensionOp
@@ -86,8 +65,6 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
 
     // prepare input
     auto inputs = sp_tokenize_op->input_value(1);
-    auto parameter = std::dynamic_pointer_cast<Parameter>(inputs.get_node_shared_ptr());
-    parameter -> set_partial_shape(PartialShape{ Dimension() });
 
     // extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes
     auto nbest_size = extract_scalar_const_value<int32_t>(sp_tokenize_op->input_value(2).get_node_shared_ptr(), "nbest_size");
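
Below is a minimal caller-side sketch (illustrative only, not part of the patch) of how sentences reach the tokenizer after this change: as a native element::string tensor instead of the packed u8 begins/ends/symbols triplet. It assumes an OpenVINO build with string-tensor support; the extension library name and model path are placeholders.

// Illustrative sketch; placeholder paths, not part of the patch.
#include <openvino/openvino.hpp>
#include <iostream>

int main() {
    ov::Core core;
    // The SentencepieceTokenizer custom op lives in the user extensions library.
    core.add_extension("libuser_ov_extensions.so");

    // A tokenizer model converted with the translators touched by this patch.
    auto compiled = core.compile_model("sentencepiece_tokenizer.xml", "CPU");
    auto request = compiled.create_infer_request();

    // Sentences are passed as a string tensor; the packed u8 representation
    // (begins/ends/symbols) is no longer accepted by the op.
    ov::Tensor sentences(ov::element::string, ov::Shape{2});
    sentences.data<std::string>()[0] = "Hello world";
    sentences.data<std::string>()[1] = "SentencePiece test";

    request.set_input_tensor(sentences);
    request.infer();

    // Output 0: sparse indices (i64), output 1: token ids (i32), output 2: dense shape (i64).
    auto token_ids = request.get_output_tensor(1);
    std::cout << "produced " << token_ids.get_size() << " tokens" << std::endl;
    return 0;
}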