diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
index 6dc0bce647..218b0af9ec 100644
--- a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
+++ b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
@@ -22,7 +22,7 @@ std::string detokenize(ov::InferRequest& detokenizer, const std::vector<int64_t
     detokenizer.infer();
     return detokenizer.get_output_tensor().data<std::string>()[0];
 }
-}  // namespace
+}

 int main(int argc, char* argv[]) try {
     if (argc != 3) {
@@ -31,15 +31,17 @@ int main(int argc, char* argv[]) try {
     // Compile models
     ov::Core core;
     core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+    //Read the tokenizer model information from the file to later get the runtime information
+    auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml");
     // tokenizer and detokenizer work on CPU only
-    ov::InferRequest tokenizer =
-        core.compile_model(std::string{argv[1]} + "/openvino_tokenizer.xml", "CPU").create_infer_request();
+    ov::InferRequest tokenizer = core.compile_model(
+        tokenizer_model, "CPU").create_infer_request();
     auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]);
-    ov::InferRequest detokenizer =
-        core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
+    ov::InferRequest detokenizer = core.compile_model(
+        std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
     // The model can be compiled for GPU as well
-    ov::InferRequest lm =
-        core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request();
+    ov::InferRequest lm = core.compile_model(
+        std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request();
     // Initialize inputs
     lm.set_tensor("input_ids", input_ids);
     lm.set_tensor("attention_mask", attention_mask);
@@ -49,8 +51,18 @@ int main(int argc, char* argv[]) try {
     lm.get_tensor("beam_idx").set_shape({1});
     lm.get_tensor("beam_idx").data<int32_t>()[0] = 0;

+    // Get the runtime info from the tokenizer model that we read earlier
+    auto rt_info = tokenizer_model->get_rt_info();  //Get the runtime info for the model
+    int64_t SPECIAL_EOS_TOKEN;
+
+    if (rt_info.count("eos_token_id") > 0) {  //check if the runtime information has a valid EOS token ID
+        SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as<int64_t>();
+
+    } else {
+        throw std::runtime_error("EOS token ID not found in model's runtime information.");
+    }
     const int64_t* prompt_data = input_ids.data<const int64_t>();
-    Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()}};
+    Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()}, SPECIAL_EOS_TOKEN};
     GroupBeamSearcher group_beam_searcher{parameters};
     std::vector<int64_t> next_tokens;
     std::vector<int32_t> next_beams;
diff --git a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp
index 9dcdea770b..d75d32d0e0 100644
--- a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp
+++ b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp
@@ -61,9 +61,11 @@ int main(int argc, char* argv[]) try {
     // Compile models
     ov::Core core;
     core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+    //Read the tokenizer model information from the file to later get the runtime information
+    auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml");
     // tokenizer and detokenizer work on CPU only
     ov::InferRequest tokenizer = core.compile_model(
-        std::string{argv[1]} + "/openvino_tokenizer.xml", "CPU").create_infer_request();
+        tokenizer_model, "CPU").create_infer_request();
     auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]);
     ov::InferRequest detokenizer = core.compile_model(
         std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
@@ -91,9 +93,17 @@ int main(int argc, char* argv[]) try {
     lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1});
     position_ids.set_shape({BATCH_SIZE, 1});
     TextStreamer text_streamer{std::move(detokenizer)};
-    // There's no way to extract special token values from the detokenizer for now
-    constexpr int64_t SPECIAL_EOS_TOKEN = 2;
-
+
+    // Get the runtime info from the tokenizer model that we read earlier
+    auto rt_info = tokenizer_model->get_rt_info();  //Get the runtime info for the model
+    int64_t SPECIAL_EOS_TOKEN;
+
+    if (rt_info.count("eos_token_id") > 0) {  //check if the runtime information has a valid EOS token ID
+        SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as<int64_t>();
+    } else {
+        throw std::runtime_error("EOS token ID not found in model's runtime information.");
+    }
+
     int max_sequence_length = 100;
     while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) {
         ++seq_len;
diff --git a/text_generation/causal_lm/cpp/group_beam_searcher.hpp b/text_generation/causal_lm/cpp/group_beam_searcher.hpp
index f63de940fc..1703ac79b8 100644
--- a/text_generation/causal_lm/cpp/group_beam_searcher.hpp
+++ b/text_generation/causal_lm/cpp/group_beam_searcher.hpp
@@ -44,10 +44,7 @@ std::vector<int64_t> kmp_search(const std::vector<int64_t>& haystack, const std:
     return res;
 }

-struct Token {
-    float log_prob;
-    int64_t idx;
-};
+struct Token {float log_prob; int64_t idx;};

 std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
     if (logits.get_shape().at(0) <= batch_idx) {
@@ -58,10 +55,10 @@
     size_t sequence_offset = (logits.get_shape().at(1) - 1) * vocab_size;
     const float* beam_logits = logits.data<const float>() + batch_offset + sequence_offset;
     float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size);
-    float log_sum = std::log(
-        std::accumulate(beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
+    float log_sum = std::log(std::accumulate(
+        beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
             return accumulated + std::exp(to_add - max_logit);
-        }));
+    }));
     std::vector<Token> tokens;
     tokens.reserve(vocab_size);
     for (size_t idx = 0; idx < vocab_size; ++idx) {
@@ -80,10 +77,11 @@ bool greater(const Beam& left, const Beam& right) {
     return left.score > right.score;
 }

-enum class StopCriteria { early, heuristic, never };
+enum class StopCriteria {early, heuristic, never};

 struct Parameters {
     std::vector<int64_t> prompt;
+    int64_t eos_token;
     size_t n_groups = 3;
     size_t group_size = 5;
     float diversity_penalty = 1.0;
@@ -91,15 +89,12 @@ struct Parameters {
     StopCriteria stop_criteria = StopCriteria::heuristic;
     float length_penalty = 1.0;
     size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
-    // There's no way to extract special token values from the tokenizer for now
-    int64_t eos_token = 2;
-    std::function<bool(const Beam&)> early_finish = [](const Beam&) {
-        return false;
-    };
+
+    std::function<bool(const Beam&)> early_finish = [](const Beam&){return false;};
 };

 struct Group {
-    std::vector<Beam> ongoing;   // Best beams in front
+    std::vector<Beam> ongoing;  // Best beams in front
     std::vector<Beam> min_heap;  // The worst of the best completed beams is the first
     bool done = false;

@@ -126,30 +121,26 @@
         float best_sum_logprobs = ongoing.front().score;
         float worst_score = min_heap.front().score;
         switch (parameters.stop_criteria) {
-            case StopCriteria::early:
-                done = true;
-                return;
-            case StopCriteria::heuristic: {
-                float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
-                done = worst_score >= highest_attainable_score;
-                return;
-            }
-            case StopCriteria::never: {
-                size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
-                float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
-                done = worst_score >= highest_attainable_score;
-                return;
-            }
-            default:
-                throw std::runtime_error("Never reached");
+        case StopCriteria::early:
+            done = true;
+            return;
+        case StopCriteria::heuristic: {
+            float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
+            done = worst_score >= highest_attainable_score;
+            return;
+        }
+        case StopCriteria::never: {
+            size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
+            float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
+            done = worst_score >= highest_attainable_score;
+            return;
+        }
+        default: throw std::runtime_error("Never reached");
         }
     }
 };

-struct TokenToBeam {
-    int64_t token_idx;
-    int32_t beam_idx;
-};
+struct TokenToBeam {int64_t token_idx; int32_t beam_idx;};

 // GroupBeamSearcher processes logits prduced by a language model and accumulates beams using group beam search
 // algorithm. select_next_tokens() returns token ids selected by the algorithm and corresponding beam ids. These values
diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers
index dc4fcb64bd..0e4bb32ca3 160000
--- a/thirdparty/openvino_tokenizers
+++ b/thirdparty/openvino_tokenizers
@@ -1 +1 @@
-Subproject commit dc4fcb64bd95ca6b1ca7a8db1016d77a51303c64
+Subproject commit 0e4bb32ca3412f589e1d094faa8b0aad19ee47ca
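
Note: with this change both samples resolve the EOS token from the tokenizer
model's runtime information instead of the previous hard-coded
"constexpr int64_t SPECIAL_EOS_TOKEN = 2;". A minimal standalone sketch of that
lookup follows, assuming a model directory converted with openvino_tokenizers
(so openvino_tokenizer.xml carries an "eos_token_id" rt_info entry) and the
OPENVINO_TOKENIZERS_PATH definition from the samples' CMakeLists.txt; the
helper name read_eos_token_id is illustrative, not part of the patch:

    #include <openvino/openvino.hpp>

    #include <cstdint>
    #include <iostream>
    #include <memory>
    #include <stdexcept>
    #include <string>

    // Illustrative helper mirroring the pattern the patch adds to both samples:
    // look up "eos_token_id" in the model's rt_info (an ov::AnyMap) and fail
    // loudly if the converter did not store it.
    int64_t read_eos_token_id(const std::shared_ptr<ov::Model>& tokenizer_model) {
        auto rt_info = tokenizer_model->get_rt_info();
        if (rt_info.count("eos_token_id") > 0) {
            return rt_info["eos_token_id"].as<int64_t>();
        }
        throw std::runtime_error("EOS token ID not found in model's runtime information.");
    }

    int main(int argc, char* argv[]) {
        if (argc != 2) {
            std::cerr << "Usage: " << argv[0] << " <MODEL_DIR>\n";
            return 1;
        }
        ov::Core core;
        // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt, as in the samples;
        // the extension must be registered before the tokenizer IR can be read.
        core.add_extension(OPENVINO_TOKENIZERS_PATH);
        auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml");
        std::cout << "eos_token_id: " << read_eos_token_id(tokenizer_model) << '\n';
    }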
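One follow-on effect of the group_beam_searcher.hpp change worth flagging for
other callers: eos_token is now the second member of Parameters and, unlike the
remaining fields, has no default member initializer. Aggregate initialization
that passes only the prompt, e.g. Parameters{prompt_tokens}, still compiles but
value-initializes eos_token to 0; callers must supply the token id explicitly,
as beam_search_causal_lm.cpp now does by passing SPECIAL_EOS_TOKEN as the
second initializer.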