Del PackedString Representation for Sentencepiece
apaniukov committed Jan 31, 2024
1 parent 2de6890 commit 39e9eb9
Showing 2 changed files with 7 additions and 63 deletions.
@@ -91,27 +91,14 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const s
 }
 
 void SentencepieceTokenizer::validate_and_infer_types() {
-
-#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS
-
-    FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, symbols)");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second input and it should be of type i32 tensor");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third input and it should be of type i32 tensor");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");
-
-#else
-
     FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences");
     FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
 
     FRONT_END_GENERAL_CHECK(
-        // WA: sometimes f32 appeared as a placeholder for unknown type
-        get_input_element_type(1) == element::u8 || get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
+        // WA: f32 appeared as a placeholder for unknown type during intermediate conversion steps
+        get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
         "SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");
 
-#endif
-
     // The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values
     // and dense shape
    set_output_type(0, element::i64, PartialShape{ Dimension(), Dimension(2) });
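
After this change, validate_and_infer_types() accepts the sentences input only as a native string tensor (or the f32 placeholder noted in the WA comment), so callers no longer pre-pack strings into byte tensors. A minimal sketch of building such an input, assuming the ov::Tensor string-element API that the evaluate() hunk below already relies on; make_string_input is a hypothetical helper, not part of this commit:

#include <openvino/runtime/tensor.hpp>
#include <string>
#include <vector>

// Build the sentences input for SentencepieceTokenizer: one std::string per batch element.
ov::Tensor make_string_input(const std::vector<std::string>& sentences) {
    ov::Tensor input(ov::element::string, ov::Shape{sentences.size()});
    auto* data = input.data<std::string>();  // per-element std::string storage
    for (size_t i = 0; i < sentences.size(); ++i) {
        data[i] = sentences[i];
    }
    return input;
}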
@@ -133,16 +120,6 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
     std::vector<int32_t> sparse_values;
     std::vector<int64_t> sparse_dense_shape;
 
-#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS
-
-    auto begin_ids = inputs[1].data<const int32_t>();
-    auto end_ids = inputs[2].data<const int32_t>();
-    auto data = inputs[3].data<const uint8_t>();
-
-    auto batch_size = shape_size(inputs[1].get_shape());
-
-#else
-
     auto input_element_type = get_input_element_type(1);
     int32_t batch_size;
 
@@ -157,24 +134,14 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
     if(input_element_type == ov::element::string) {
         strings = inputs[1].data<const std::string>();
         batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
-    } else if(input_element_type == ov::element::u8) {
-        parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data);
     } else {
         OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
     }
 
-#endif
-
     size_t max_token_id = 0;
     for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) {
         absl::string_view sentence;
-        if(input_element_type == ov::element::string) {
-            sentence = strings[batch_ind];
-        } else if(input_element_type == ov::element::u8) {
-            auto begin_ind = begin_ids[batch_ind];
-            auto end_ind = end_ids[batch_ind];
-            sentence = absl::string_view((const char*)data + begin_ind, end_ind - begin_ind);
-        }
+        sentence = strings[batch_ind];
 
         std::vector<int32_t> ids;
         CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids));
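
The deleted SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS branches implemented the packed-string representation this commit removes: each batch arrived as three decomposed tensors holding begin offsets (i32), end offsets (i32), and the concatenated UTF-8 bytes (u8). A self-contained sketch of decoding that layout with plain C++ types, for illustration only (the extension itself went through its parse_packed_strings helper):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
    // Two sentences packed into one flat byte buffer, addressed by offset pairs.
    std::string chars = "hello world";
    std::vector<int32_t> begins = {0, 6};
    std::vector<int32_t> ends   = {5, 11};

    for (size_t i = 0; i < begins.size(); ++i) {
        // Each sentence is the byte range [begins[i], ends[i]) of the buffer.
        std::string sentence(chars.data() + begins[i], ends[i] - begins[i]);
        std::cout << sentence << '\n';  // prints "hello", then "world"
    }
    return 0;
}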
@@ -39,35 +39,14 @@ namespace {
 
 OutputVector translate_sentencepiece_op(const NodeContext& node) {
     // extract model to configure SentencePieceTokenizer
-    // std::cout << "[ Trace 1 ] Before" << std::endl;
-    // auto sp_model_ov_any = node.get_attribute_as_any("model");
-    // std::cout << "[ Trace 1 ] Get Model" << std::endl;
-    // FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
-    //                         "SentencePieceOp configuration model is in incorrect format");
-    std::ifstream input( "/home/apaniuko/.config/JetBrains/RemoteDev-PY/_home_apaniuko_python_openvino_contrib/scratches/bytes", std::ios::binary );
-    std::vector<unsigned char> str_spm_model(std::istreambuf_iterator<char>(input), {});
-    std::cout << "[ Trace 1 ] FE Check" << std::endl;
-
-    // auto str_spm_model = sp_model_ov_any.as<std::vector<uint32_t>>();
-    std::cout << "[ Trace 1 ] As string" << std::endl;
-    // str_spm_model = str_spm_model.substr(2);
-    // str_spm_model = str_spm_model.substr(0, str_spm_model.size() - 1);
+    auto sp_model_ov_any = node.get_attribute_as_any("model");
+    FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
+                            "SentencePieceOp configuration model is in incorrect format");
+    auto str_spm_model = sp_model_ov_any.as<std::string>();
     auto sp_model_const = std::make_shared<Constant>(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data());
-    // std::cout << "[ Trace 1 ] Successful size:"<< str_spm_model.size() << "\n" << str_spm_model.substr(0, 100) << std::endl;
-    std::cout << "[ Trace 1 ] Successful" << std::endl;
     return { sp_model_const };
 }
 
-//OutputVector translate_sentencepiece_op(const NodeContext& node) {
-//    // extract model to configure SentencePieceTokenizer
-//    auto sp_model_ov_any = node.get_attribute_as_any("model");
-//    FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
-//                            "SentencePieceOp configuration model is in incorrect format");
-//    auto str_spm_model = sp_model_ov_any.as<std::string>();
-//    auto sp_model_const = std::make_shared<Constant>(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data());
-//    return { sp_model_const };
-//}
-
 NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
     // this is a custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer,
    // and RaggedTensorToSparse operations into a custom operation SentencepieceTokenizerExtensionOp
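
The cleaned-up translator reads the serialized SentencePiece model proto from the node's "model" attribute and stores it byte-for-byte in a u8 Constant. The same idea as a standalone sketch against the public OpenVINO API, with namespaces spelled out (the file above shortens them with using-declarations); wrap_spm_model is a hypothetical name:

#include <openvino/op/constant.hpp>
#include <memory>
#include <string>

// Wrap serialized SentencePiece model bytes in a u8 constant node; the
// SentencepieceTokenizer op later parses the proto out of this buffer.
std::shared_ptr<ov::op::v0::Constant> wrap_spm_model(const std::string& spm_bytes) {
    return std::make_shared<ov::op::v0::Constant>(
        ov::element::u8, ov::Shape{spm_bytes.size()}, spm_bytes.data());
}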
@@ -86,8 +65,6 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
 
     // prepare input
     auto inputs = sp_tokenize_op->input_value(1);
-    auto parameter = std::dynamic_pointer_cast<Parameter>(inputs.get_node_shared_ptr());
-    parameter -> set_partial_shape(PartialShape{ Dimension() });
 
     // extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes
    auto nbest_size = extract_scalar_const_value<int32_t>(sp_tokenize_op->input_value(2).get_node_shared_ptr(), "nbest_size");
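
The deleted lines forcibly reshaped the input Parameter to a 1-D dynamic shape; with them gone, the translator only reads scalar attributes off the remaining constant inputs via extract_scalar_const_value. That helper belongs to the repository; a hypothetical sketch of what such a helper can look like (an assumption, not the actual implementation):

#include <openvino/core/except.hpp>
#include <openvino/core/node.hpp>
#include <openvino/op/constant.hpp>
#include <memory>
#include <string>

template <typename T>
T extract_scalar_const_value_sketch(const std::shared_ptr<ov::Node>& node, const std::string& name) {
    auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
    OPENVINO_ASSERT(constant != nullptr, name, " is expected to be a Constant node");
    auto values = constant->cast_vector<T>();  // flatten the constant's contents to a vector of T
    OPENVINO_ASSERT(values.size() == 1, name, " is expected to be a scalar");
    return values[0];
}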
