From 11fbaa27391d76aa69208d039066e67d23bc043f Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Wed, 15 May 2024 11:45:00 +0200
Subject: [PATCH] tokenizer minor fixes

---
 src/cpp/src/llm_pipeline.cpp | 8 ++++++--
 src/cpp/src/tokenizer.cpp    | 8 +-------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 30ed23e9f8..47bf3495d5 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -143,7 +143,7 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate(
 
     auto [input_ids, attention_mask] = m_tokenizer.encode(text);
 
-    // todo: W/A If sentence begins with a specfial tokens (<bos>, <s>, etc.) openvino_tokenizer inserts 2 special extra tokens <bos> and "▁",
+    // todo: W/A If sentence begins with special tokens (<bos>, <s>, etc.) openvino_tokenizer inserts 2 special extra tokens <bos> and "▁",
     // but HF does not do that. Moreover openvino_tokenizer always inserts <bos> but in chat scenario HF does not do that because skip_special_tokens=True.
     // Need to remove both of that tokens manually to get exact token by token alignment with HF
     auto size = input_ids.get_shape();
@@ -155,7 +155,7 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate(
     std::vector<int64_t> tmp_attn_mask(attention_mask_data, attention_mask_data + attention_mask.get_size());
     // tmp_attn_mask.erase(tmp_attn_mask.begin());
 
-    std::vector<std::string> prefixes_to_exclude = {"</s>", "<s>"};  // todo: for TinyLlama, need to get them form generation_config
+    std::vector<std::string> prefixes_to_exclude = {config.eos_token, config.bos_token};
     auto prefix_match = [&text](std::string prefix) { return text.substr(0, prefix.length()) == prefix; };
     if (std::any_of(prefixes_to_exclude.begin(), prefixes_to_exclude.end(), prefix_match)) {
         tmp_ids.erase(tmp_ids.begin());
         tmp_attn_mask.erase(tmp_attn_mask.begin());
     }
@@ -221,6 +221,10 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate(
     } else if (auto callback = std::get_if<std::function<void (std::string)>>(&*streamer)) {
         streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
     }
+    auto batch_size = input_ids.get_shape().at(0);
+    if ((batch_size != 1 || !config_helper.is_greedy_decoding()) && streamer_ptr) {
+        OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy decoding");
+    }
 
     auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::generate_utils::init_attention_mask(input_ids);
 
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index a11cfb471a..09d64460a2 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -93,13 +93,7 @@ class Tokenizer::TokenizerImpl {
         m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()});
         auto size_ = m_tokenize_request.get_input_tensor().get_shape();
         m_tokenize_request.infer();
-
-        ::pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id);
-        // todo: fix mask filled with '2' instead of '0'
-        ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask");
-        int64_t* attention_mask_data = attention_mask.data<int64_t>();
-        std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0);
-
+        pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id);
         return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")};
     }
 
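
For reviewers who want to poke at the BOS/EOS prefix workaround outside the
pipeline, the following is a minimal standalone sketch of the stripping logic
(plain standard C++ with no OpenVINO types; strip_special_prefix and the token
ids are illustrative placeholders, not names from this patch):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <string>
    #include <vector>

    // Mirrors the W/A in generate(): drop the leading token id and mask entry
    // only when the prompt text itself starts with one of the special tokens.
    void strip_special_prefix(const std::string& text,
                              const std::vector<std::string>& prefixes,
                              std::vector<int64_t>& ids,
                              std::vector<int64_t>& mask) {
        auto prefix_match = [&text](const std::string& prefix) {
            return text.substr(0, prefix.length()) == prefix;
        };
        if (std::any_of(prefixes.begin(), prefixes.end(), prefix_match) && !ids.empty()) {
            ids.erase(ids.begin());
            mask.erase(mask.begin());
        }
    }

    int main() {
        std::vector<int64_t> ids{1, 15043};  // placeholder ids for <s>, "Hello"
        std::vector<int64_t> mask{1, 1};
        strip_special_prefix("<s>Hello", {"</s>", "<s>"}, ids, mask);
        assert(ids.size() == 1 && ids[0] == 15043);
        return 0;
    }

One caveat the sketch makes visible: an empty string matches every prompt, so
if config.eos_token or config.bos_token is empty the first real token would be
stripped unconditionally; that may deserve a follow-up guard.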
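
The new streaming guard changes observable behavior: a streamer combined with
beam search or batched inputs now fails fast instead of silently mis-streaming.
A hypothetical caller hitting the check could look like the sketch below (the
header path, model path, and get_generation_config() accessor are assumptions
about this branch's public API; only the generate overload with a callback is
taken from the patch itself):

    #include <iostream>
    #include <string>
    #include "llm_pipeline.hpp"  // assumed header name on this branch

    int main() {
        ov::LLMPipeline pipe("TinyLlama");  // illustrative model path
        ov::GenerationConfig config = pipe.get_generation_config();
        config.num_beams = 4;  // beam search, i.e. not greedy decoding

        // With this patch the call below throws:
        // "Currently streaming is possible only with batch size=1 and greedy decoding"
        pipe.generate("The Sun is yellow because", config,
                      [](std::string subword) { std::cout << subword; });
        return 0;
    }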
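
The tokenizer.cpp hunk deletes the '2'-to-'0' mask rewrite, which is only safe
if pad_left itself now writes a clean mask. As a reference point for the
expected contract, here is a toy left-pad over nested vectors (pad_left_toy is
a made-up stand-in, not the tensor-based helper this file actually calls):

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Toy left-padding for a batch of id rows: shorter rows are shifted right,
    // the front is filled with pad_token_id, and the mask is rebuilt with 0
    // over padding and 1 over real tokens, leaving no stray '2' values to
    // patch up afterwards.
    void pad_left_toy(std::vector<std::vector<int64_t>>& ids,
                      std::vector<std::vector<int64_t>>& mask,
                      int64_t pad_token_id) {
        size_t max_len = 0;
        for (const auto& row : ids)
            max_len = std::max(max_len, row.size());
        mask.clear();
        for (auto& row : ids) {
            size_t pad = max_len - row.size();
            std::vector<int64_t> padded(pad, pad_token_id);
            padded.insert(padded.end(), row.begin(), row.end());
            row = std::move(padded);
            std::vector<int64_t> m(pad, 0);  // 0 over the left padding
            m.resize(max_len, 1);            // 1 over the real tokens
            mask.push_back(std::move(m));
        }
    }

For example, rows {1, 2, 3} and {7} with pad_token_id 0 come out as {1, 2, 3}
and {0, 0, 7}, with masks {1, 1, 1} and {0, 0, 1}.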