openvinotoolkit · ilya-lavrenov · Apr 9, 2024 · Mar 20, 2024 · Mar 21, 2024 · Mar 23, 2024
diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
@@ -22,7 +22,7 @@ std::string detokenize(ov::InferRequest& detokenizer, const std::vector<int64_t>
     detokenizer.infer();
     return detokenizer.get_output_tensor().data<std::string>()[0];
 }
-}  // namespace
+}
 
 int main(int argc, char* argv[]) try {
     if (argc != 3) {
@@ -31,15 +31,17 @@ int main(int argc, char* argv[]) try {
     // Compile models
     ov::Core core;
     core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+    // Read the tokenizer model from file first so that its runtime info (rt_info) can be queried later
+    auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml");
     // tokenizer and detokenizer work on CPU only
-    ov::InferRequest tokenizer =
-        core.compile_model(std::string{argv[1]} + "/openvino_tokenizer.xml", "CPU").create_infer_request();
+    ov::InferRequest tokenizer = core.compile_model(
+        tokenizer_model, "CPU").create_infer_request();
     auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]);
-    ov::InferRequest detokenizer =
-        core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
+    ov::InferRequest detokenizer = core.compile_model(
+        std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
     // The model can be compiled for GPU as well
-    ov::InferRequest lm =
-        core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request();
+    ov::InferRequest lm = core.compile_model(
+        std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request();
     // Initialize inputs
     lm.set_tensor("input_ids", input_ids);
     lm.set_tensor("attention_mask", attention_mask);
@@ -49,8 +51,18 @@ int main(int argc, char* argv[]) try {
     lm.get_tensor("beam_idx").set_shape({1});
     lm.get_tensor("beam_idx").data<int32_t>()[0] = 0;
 
+    // Get the runtime info from the tokenizer model that we read earlier
+    auto rt_info = tokenizer_model->get_rt_info(); //Get the runtime info for the model
+    int64_t SPECIAL_EOS_TOKEN;
+
+    if (rt_info.count("eos_token_id") > 0) { // check whether the runtime info contains an "eos_token_id" entry
+        SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as<int64_t>();
+
+    } else {
+        throw std::runtime_error("EOS token ID not found in model's runtime information.");
+    }
     const int64_t* prompt_data = input_ids.data<const int64_t>();
-    Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()}};
+    Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()}, SPECIAL_EOS_TOKEN};
     GroupBeamSearcher group_beam_searcher{parameters};
     std::vector<int64_t> next_tokens;
     std::vector<int32_t> next_beams;

diff --git a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp
@@ -61,9 +61,11 @@ int main(int argc, char* argv[]) try {
     // Compile models
     ov::Core core;
     core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+    // Read the tokenizer model from file first so that its runtime info (rt_info) can be queried later
+    auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml");
     // tokenizer and detokenizer work on CPU only
     ov::InferRequest tokenizer = core.compile_model(
-        std::string{argv[1]} + "/openvino_tokenizer.xml", "CPU").create_infer_request();
+        tokenizer_model, "CPU").create_infer_request();
     auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]);
     ov::InferRequest detokenizer = core.compile_model(
         std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
@@ -91,9 +93,17 @@ int main(int argc, char* argv[]) try {
     lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1});
     position_ids.set_shape({BATCH_SIZE, 1});
     TextStreamer text_streamer{std::move(detokenizer)};
-    // There's no way to extract special token values from the detokenizer for now
-    constexpr int64_t SPECIAL_EOS_TOKEN = 2;
-
+
+    // Get the runtime info from the tokenizer model that we read earlier
+    auto rt_info = tokenizer_model->get_rt_info(); //Get the runtime info for the model
+    int64_t SPECIAL_EOS_TOKEN;
+
+    if (rt_info.count("eos_token_id") > 0) { // check whether the runtime info contains an "eos_token_id" entry
+        SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as<int64_t>();
+    } else {
+        throw std::runtime_error("EOS token ID not found in model's runtime information.");
+    }
+
     int max_sequence_length = 100;
     while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) {
         ++seq_len;

diff --git a/text_generation/causal_lm/cpp/group_beam_searcher.hpp b/text_generation/causal_lm/cpp/group_beam_searcher.hpp
@@ -44,10 +44,7 @@ std::vector<int64_t> kmp_search(const std::vector<int64_t>& haystack, const std:
     return res;
 }
 
-struct Token {
-    float log_prob;
-    int64_t idx;
-};
+struct Token {float log_prob; int64_t idx;};
 
 std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
     if (logits.get_shape().at(0) <= batch_idx) {
@@ -58,10 +55,10 @@ std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
     size_t sequence_offset = (logits.get_shape().at(1) - 1) * vocab_size;
     const float* beam_logits = logits.data<const float>() + batch_offset + sequence_offset;
     float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size);
-    float log_sum = std::log(
-        std::accumulate(beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
+    float log_sum = std::log(std::accumulate(
+        beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
             return accumulated + std::exp(to_add - max_logit);
-        }));
+    }));
     std::vector<Token> tokens;
     tokens.reserve(vocab_size);
     for (size_t idx = 0; idx < vocab_size; ++idx) {
@@ -80,26 +77,24 @@ bool greater(const Beam& left, const Beam& right) {
     return left.score > right.score;
 }
 
-enum class StopCriteria { early, heuristic, never };
+enum class StopCriteria {early, heuristic, never};
 
 struct Parameters {
     std::vector<int64_t> prompt;
+    int64_t eos_token;
     size_t n_groups = 3;
     size_t group_size = 5;
     float diversity_penalty = 1.0;
     size_t max_new_tokens = 20;
     StopCriteria stop_criteria = StopCriteria::heuristic;
     float length_penalty = 1.0;
     size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
-    // There's no way to extract special token values from the tokenizer for now
-    int64_t eos_token = 2;
-    std::function<bool(const Beam&)> early_finish = [](const Beam&) {
-        return false;
-    };
+
+    std::function<bool(const Beam&)> early_finish = [](const Beam&){return false;};
 };
 
 struct Group {
-    std::vector<Beam> ongoing;   // Best beams in front
+    std::vector<Beam> ongoing;  // Best beams in front
     std::vector<Beam> min_heap;  // The worst of the best completed beams is the first
     bool done = false;
 
@@ -126,30 +121,26 @@ struct Group {
         float best_sum_logprobs = ongoing.front().score;
         float worst_score = min_heap.front().score;
         switch (parameters.stop_criteria) {
-        case StopCriteria::early:
-            done = true;
-            return;
-        case StopCriteria::heuristic: {
-            float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
-            done = worst_score >= highest_attainable_score;
-            return;
-        }
-        case StopCriteria::never: {
-            size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
-            float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
-            done = worst_score >= highest_attainable_score;
-            return;
-        }
-        default:
-            throw std::runtime_error("Never reached");
+            case StopCriteria::early:
+                done = true;
+                return;
+            case StopCriteria::heuristic: {
+                float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
+                done = worst_score >= highest_attainable_score;
+                return;
+            }
+            case StopCriteria::never: {
+                size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
+                float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
+                done = worst_score >= highest_attainable_score;
+                return;
+            }
+            default: throw std::runtime_error("Never reached");
         }
     }
 };
 
-struct TokenToBeam {
-    int64_t token_idx;
-    int32_t beam_idx;
-};
+struct TokenToBeam {int64_t token_idx; int32_t beam_idx;};
 
 // GroupBeamSearcher processes logits produced by a language model and accumulates beams using group beam search
 // algorithm. select_next_tokens() returns token ids selected by the algorithm and corresponding beam ids. These values

diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers
+59 −11		.github/workflows/linux.yml
+58 −10		.github/workflows/mac.yml
+59 −10		.github/workflows/windows.yml
+6 −1		CMakeLists.txt
+1 −1		pyproject.toml
+8 −7		python/openvino_tokenizers/hf_parser.py
+8 −0		python/openvino_tokenizers/tokenizer_pipeline.py
+8 −3		src/CMakeLists.txt
+1 −1		tests/pass_rates.json
+1 −1		tests/tokenizers_test.py