Del PackedString Representation for Sentencepiece
apaniukov committed Jan 31, 2024
1 parent 2de6890 commit 39e9eb9
Showing 2 changed files with 7 additions and 63 deletions.
@@ -91,27 +91,14 @@ SentencepieceTokenizer::SentencepieceTokenizer(const OutputVector& args, const s
 }
 
 void SentencepieceTokenizer::validate_and_infer_types() {
-
-#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS
-
-    FRONT_END_GENERAL_CHECK(get_input_size() == 1 + 3, "SentencepieceTokenizer expects 4 inputs: sp model and input sentences represented as 3 decomposed tensors (begins, ends, symbols)");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(1) == element::i32, "SentencepieceTokenizer accepts begins offsets as the second input and it should be of type i32 tensor");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(2) == element::i32, "SentencepieceTokenizer accepts ends offsets as the third input and it should be of type i32 tensor");
-    FRONT_END_GENERAL_CHECK(get_input_element_type(3) == element::u8, "SentencepieceTokenizer accepts sentence symbols as the fourth input and it should be of type u8 tensor");
-
-#else
-
     FRONT_END_GENERAL_CHECK(get_input_size() == 2, "SentencepieceTokenizer expects two inputs: sp model and input sentences");
     FRONT_END_GENERAL_CHECK(get_input_element_type(0) == element::u8, "SentencepieceTokenizer accepts sp model as the first input and it should be of type u8 tensor");
 
     FRONT_END_GENERAL_CHECK(
-        // WA: sometimes f32 appeared as a placeholder for unknown type
-        get_input_element_type(1) == element::u8 || get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
+        // WA: f32 appeared as a placeholder for unknown type during intermediate conversion steps
+        get_input_element_type(1) == element::string || get_input_element_type(1) == element::f32,
         "SentencepieceTokenizer accepts sentences as the second input and it should be of type string tensor");
 
-#endif
-
     // The operation SentencepieceTokenizerExtensionOp has three outputs: sparse indices, sparse values
     // and dense shape
    set_output_type(0, element::i64, PartialShape{ Dimension(), Dimension(2) });
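
After this change, validate_and_infer_types() accepts the sentences input only as a native string tensor (or the f32 placeholder noted in the WA comment), so callers no longer pre-pack strings into byte tensors. A minimal sketch of building such an input, assuming the ov::Tensor string-element API that the evaluate() hunk below already relies on; make_string_input is a hypothetical helper, not part of this commit:

#include <openvino/runtime/tensor.hpp>
#include <string>
#include <vector>

// Build the sentences input for SentencepieceTokenizer: one std::string per batch element.
ov::Tensor make_string_input(const std::vector<std::string>& sentences) {
    ov::Tensor input(ov::element::string, ov::Shape{sentences.size()});
    auto* data = input.data<std::string>();  // per-element std::string storage
    for (size_t i = 0; i < sentences.size(); ++i) {
        data[i] = sentences[i];
    }
    return input;
}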
@@ -133,16 +120,6 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
     std::vector<int32_t> sparse_values;
     std::vector<int64_t> sparse_dense_shape;
 
-#if SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS
-
-    auto begin_ids = inputs[1].data<const int32_t>();
-    auto end_ids = inputs[2].data<const int32_t>();
-    auto data = inputs[3].data<const uint8_t>();
-
-    auto batch_size = shape_size(inputs[1].get_shape());
-
-#else
-
     auto input_element_type = get_input_element_type(1);
     int32_t batch_size;
 
@@ -157,24 +134,14 @@ bool SentencepieceTokenizer::evaluate(TensorVector& outputs, const TensorVector&
     if(input_element_type == ov::element::string) {
         strings = inputs[1].data<const std::string>();
         batch_size = static_cast<int32_t>(ov::shape_size(inputs[1].get_shape()));
-    } else if(input_element_type == ov::element::u8) {
-        parse_packed_strings(inputs[1], batch_size, begin_ids, end_ids, data);
     } else {
         OPENVINO_THROW("Unexpected input type during inference. SentencepieceTokenizer accepts element::u8 or element::string.");
     }
 
-#endif
-
     size_t max_token_id = 0;
     for (size_t batch_ind = 0; batch_ind < batch_size; ++batch_ind) {
         absl::string_view sentence;
-        if(input_element_type == ov::element::string) {
-            sentence = strings[batch_ind];
-        } else if(input_element_type == ov::element::u8) {
-            auto begin_ind = begin_ids[batch_ind];
-            auto end_ind = end_ids[batch_ind];
-            sentence = absl::string_view((const char*)data + begin_ind, end_ind - begin_ind);
-        }
+        sentence = strings[batch_ind];
 
         std::vector<int32_t> ids;
         CHECK_OK(m_sp->SampleEncode(sentence, m_nbest_size, m_alpha, &ids));
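
The deleted SENTENCE_PIECE_EXTENSION_DECOMPOSED_STRINGS branches implemented the packed-string representation this commit removes: each batch arrived as three decomposed tensors holding begin offsets (i32), end offsets (i32), and the concatenated UTF-8 bytes (u8). A self-contained sketch of decoding that layout with plain C++ types, for illustration only (the extension itself went through its parse_packed_strings helper):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
    // Two sentences packed into one flat byte buffer, addressed by offset pairs.
    std::string chars = "hello world";
    std::vector<int32_t> begins = {0, 6};
    std::vector<int32_t> ends   = {5, 11};

    for (size_t i = 0; i < begins.size(); ++i) {
        // Each sentence is the byte range [begins[i], ends[i]) of the buffer.
        std::string sentence(chars.data() + begins[i], ends[i] - begins[i]);
        std::cout << sentence << '\n';  // prints "hello", then "world"
    }
    return 0;
}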
@@ -39,35 +39,14 @@ namespace {
 
 OutputVector translate_sentencepiece_op(const NodeContext& node) {
     // extract model to configure SentencePieceTokenizer
-    // std::cout << "[ Trace 1 ] Before" << std::endl;
-    // auto sp_model_ov_any = node.get_attribute_as_any("model");
-    // std::cout << "[ Trace 1 ] Get Model" << std::endl;
-    // FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
-    //                         "SentencePieceOp configuration model is in incorrect format");
-    std::ifstream input( "/home/apaniuko/.config/JetBrains/RemoteDev-PY/_home_apaniuko_python_openvino_contrib/scratches/bytes", std::ios::binary );
-    std::vector<unsigned char> str_spm_model(std::istreambuf_iterator<char>(input), {});
-    std::cout << "[ Trace 1 ] FE Check" << std::endl;
-
-    // auto str_spm_model = sp_model_ov_any.as<std::vector<uint32_t>>();
-    std::cout << "[ Trace 1 ] As string" << std::endl;
-    // str_spm_model = str_spm_model.substr(2);
-    // str_spm_model = str_spm_model.substr(0, str_spm_model.size() - 1);
+    auto sp_model_ov_any = node.get_attribute_as_any("model");
+    FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
+                            "SentencePieceOp configuration model is in incorrect format");
+    auto str_spm_model = sp_model_ov_any.as<std::string>();
     auto sp_model_const = std::make_shared<Constant>(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data());
-    // std::cout << "[ Trace 1 ] Successful size:"<< str_spm_model.size() << "\n" << str_spm_model.substr(0, 100) << std::endl;
-    std::cout << "[ Trace 1 ] Successful" << std::endl;
     return { sp_model_const };
 }
 
-//OutputVector translate_sentencepiece_op(const NodeContext& node) {
-//    // extract model to configure SentencePieceTokenizer
-//    auto sp_model_ov_any = node.get_attribute_as_any("model");
-//    FRONT_END_GENERAL_CHECK(sp_model_ov_any.is<std::string>(),
-//                            "SentencePieceOp configuration model is in incorrect format");
-//    auto str_spm_model = sp_model_ov_any.as<std::string>();
-//    auto sp_model_const = std::make_shared<Constant>(element::u8, Shape{ str_spm_model.size() }, str_spm_model.data());
-//    return { sp_model_const };
-//}
-
 NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
     // this is a custom translator that converts a sub-graph with SentencePieceOp, SentencePieceTokenizer,
    // and RaggedTensorToSparse operations into a custom operation SentencepieceTokenizerExtensionOp
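
The cleaned-up translator reads the serialized SentencePiece model proto from the node's "model" attribute and stores it byte-for-byte in a u8 Constant. The same idea as a standalone sketch against the public OpenVINO API, with namespaces spelled out (the file above shortens them with using-declarations); wrap_spm_model is a hypothetical name:

#include <openvino/op/constant.hpp>
#include <memory>
#include <string>

// Wrap serialized SentencePiece model bytes in a u8 constant node; the
// SentencepieceTokenizer op later parses the proto out of this buffer.
std::shared_ptr<ov::op::v0::Constant> wrap_spm_model(const std::string& spm_bytes) {
    return std::make_shared<ov::op::v0::Constant>(
        ov::element::u8, ov::Shape{spm_bytes.size()}, spm_bytes.data());
}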
@@ -86,8 +65,6 @@ NamedOutputVector translate_sentencepiece_tokenizer(const NodeContext& node) {
 
     // prepare input
     auto inputs = sp_tokenize_op->input_value(1);
-    auto parameter = std::dynamic_pointer_cast<Parameter>(inputs.get_node_shared_ptr());
-    parameter -> set_partial_shape(PartialShape{ Dimension() });
 
     // extract values for nbest_size, alpha, add_bos, add_eos, reverse attributes
    auto nbest_size = extract_scalar_const_value<int32_t>(sp_tokenize_op->input_value(2).get_node_shared_ptr(), "nbest_size");
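
The deleted lines forcibly reshaped the input Parameter to a 1-D dynamic shape; with them gone, the translator only reads scalar attributes off the remaining constant inputs via extract_scalar_const_value. That helper belongs to the repository; a hypothetical sketch of what such a helper can look like (an assumption, not the actual implementation):

#include <openvino/core/except.hpp>
#include <openvino/core/node.hpp>
#include <openvino/op/constant.hpp>
#include <memory>
#include <string>

template <typename T>
T extract_scalar_const_value_sketch(const std::shared_ptr<ov::Node>& node, const std::string& name) {
    auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
    OPENVINO_ASSERT(constant != nullptr, name, " is expected to be a Constant node");
    auto values = constant->cast_vector<T>();  // flatten the constant's contents to a vector of T
    OPENVINO_ASSERT(values.size() == 1, name, " is expected to be a scalar");
    return values[0];
}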
