diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml index 06322703da..e3eb423516 100644 --- a/.github/workflows/genai_package.yml +++ b/.github/workflows/genai_package.yml @@ -18,7 +18,7 @@ jobs: - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: sudo apt-get install libtbb-dev - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package + - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build @@ -49,7 +49,7 @@ jobs: - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.2.0-15349-765302e0de1/w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64.zip - run: unzip ov.zip - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64 - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install" if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml index a875510cf2..b39e6c74db 100644 --- a/.github/workflows/genai_python_lib.yml +++ b/.github/workflows/genai_python_lib.yml @@ -22,7 +22,7 @@ jobs: # build system doesn't. Install ./requirements-build.txt to detect possible conflicts. - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: PYTHONPATH=./src/python/ python -c "from openvino_genai import LLMPipeline" - - run: source ./ov/setupvars.sh && CMAKE_BUILD_PARALLEL_LEVEL="" python -m pip install . + - run: source ./ov/setupvars.sh && CMAKE_BUILD_PARALLEL_LEVEL="" python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: python -c "from openvino_genai import LLMPipeline" - name: GenAI Python API tests run: | @@ -51,9 +51,16 @@ jobs: - run: unzip ov.zip - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j +<<<<<<< HEAD # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal # build system doesn't. 
Install ./requirements-build.txt to detect possible conflicts. - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly +======= + - run: python -m pip install "numpy<1.27" + # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal + # build system doesn't. Install ./requirements-build.txt to detect possible conflicts. + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt +>>>>>>> generate_pipeline - run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -c "from openvino_genai import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. - run: set CMAKE_BUILD_PARALLEL_LEVEL=&& call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install . - run: python -c "from openvino_genai import LLMPipeline" diff --git a/src/README.md b/src/README.md index ad21250989..06a649a752 100644 --- a/src/README.md +++ b/src/README.md @@ -8,7 +8,7 @@ optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weigh pip install openvino-genai ``` -LLMPipeline is the main object used for decoding. You can initiliza it straigh away from the folder with the converted model. It will automanically load the main model, tokenizer, detokenizer and default generation configuration. +`LLMPipeline` is the main object used for decoding. You can construct it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and default generation configuration. ### Python @@ -24,8 +24,8 @@ Calling generate with custom generation config parameters, e.g. 
config for group import openvino_genai as ov_genai pipe = ov_genai.LLMPipeline(model_path, "CPU") -res = pipe.generate("The Sun is yellow bacause", max_new_tokens=30, num_groups=3, group_size=5) -print(res) +result = pipe.generate("The Sun is yellow bacause", max_new_tokens=30, num_groups=3, group_size=5, diversity_penalty=1.5) +print(result) ``` output: @@ -38,7 +38,7 @@ A simples chat in python: import openvino_genai as ov_genai pipe = ov_ov_genai.LLMPipeline(model_path) -config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.1} +config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.5} pipe.set_generation_cofnig(config) pipe.start_chat() @@ -49,7 +49,6 @@ while True:         break     print(pipe(prompt)) pipe.finish_chat() - ``` Test to compare with Huggingface outputs @@ -63,7 +62,7 @@ Minimalistc example int main(int argc, char* argv[]) { std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); + ov::genai::LLMPipeline pipe(model_path, "CPU"); std::cout << pipe.generate("The Sun is yellow bacause"); } ``` @@ -75,9 +74,9 @@ Using Group Beam Search Decoding int main(int argc, char* argv[]) { std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); + ov::genai::LLMPipeline pipe(model_path, "CPU"); - ov::GenerationConfig config = pipe.get_generation_config(); + ov::genai::GenerationConfig config = pipe.get_generation_config(); config.max_new_tokens = 256; config.num_groups = 3; config.group_size = 5; @@ -87,7 +86,7 @@ int main(int argc, char* argv[]) { } ``` -A simplest chat in C++ +A simple chat in C++ using grouped beam search decoding ``` cpp #include "openvino/genai/llm_pipeline.hpp" #include @@ -96,71 +95,50 @@ int main(int argc, char* argv[]) { std::string prompt; std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); - - pipe.start_chat(); - for (size_t i = 0; i < questions.size(); i++) { - std::cout << "question:\n"; - std::getline(std::cin, prompt); - - std::cout << pipe(prompt) << std::endl>>; - } - pipe.finish_chat(); -} -``` - -Specifying generation_config to use grouped beam search -``` cpp -int main(int argc, char* argv[]) { - std::string prompt; - - std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); + ov::genai::LLMPipeline pipe(model_path, "CPU"); - ov::GenerationConfig config = pipe.get_generation_config(); + ov::genai::GenerationConfig config = pipe.get_generation_config(); config.max_new_tokens = 256; config.num_groups = 3; config.group_size = 5; config.diversity_penalty = 1.0f; - auto streamer = [](std::string word) { std::cout << word << std::flush; }; - pipe.start_chat(); - for (size_t i = 0; i < questions.size(); i++) { - + for (;;;) { std::cout << "question:\n"; - cout << prompt << endl; + std::getline(std::cin, prompt); + if (prompt == "Stop!") + break; - auto answer = pipe(prompt, config, streamer); - // no need to print answer, streamer will do that + std::cout << "answer:\n"; + auto answer = pipe(prompt, config); + std::cout << answer << std::endl; } pipe.finish_chat(); } ``` -Streaming exapmle with lambda function - +Streaming example with lambda function ``` cpp - #include "openvino/genai/llm_pipeline.hpp" #include int main(int argc, char* argv[]) { std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); + ov::genai::LLMPipeline pipe(model_path, "CPU"); auto streamer = [](std::string word) { std::cout << word << std::flush; }; std::cout << pipe.generate("The Sun is yellow bacause", streamer); } ``` -Streaming with custom class 
+Streaming with a custom class ``` cpp -#include +#include "openvino/genai/streamer_base.hpp" #include "openvino/genai/llm_pipeline.hpp" #include -class CustomStreamer: publict StreamerBase { +class CustomStreamer: public ov::genai::StreamerBase { public: void put(int64_t token) { /* custom decoding/tokens processing code @@ -179,7 +157,7 @@ int main(int argc, char* argv[]) { CustomStreamer custom_streamer; std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); - cout << pipe.generate("The Sun is yellow bacause", custom_streamer); + ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::cout << pipe.generate("The Sun is yellow bacause", custom_streamer); } ``` diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 837fae21ad..4c43f880d9 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -12,6 +12,7 @@ #include "openvino/genai/tokenizer.hpp" namespace ov { +namespace genai { /** * @brief controls the stopping condition for grouped beam search. The following values are possible: @@ -22,43 +23,48 @@ namespace ov { enum class StopCriteria { early, heuristic, never }; /** - * @brief structure to keep generation config parameters. + * @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + * and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + * be used while greedy and beam search parameters will not affect decoding at all. * + * Generic parameters: * @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt + * `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. * @param max_new_tokens the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. * @param ignore_eos if set to true, then generation will not stop even if token is met. + * @param pad_token_id token_id of (padding) + * @param bos_token_id token_id of (beggining of sentence) + * @param eos_token_id token_id of (end of sentence) + * @param bos_token token string representation + * @param eos_token token string representation + * + * Beam search specific parameters: * @param num_beams number of beams for beam search. 1 disables beam search. * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. * @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from other group at a - * particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. + * particular time. See https://arxiv.org/pdf/1909.05858. * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to * the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log * likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while * `length_penalty` < 0.0 encourages shorter sequences. - * @param num_return_sequences the number of sequences to return for grouped beam search decoding + * @param num_return_sequences the number of sequences to return for grouped beam search decoding. 
* @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once. * @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values: * "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an * heuristic is applied and the generation stops when is it very unlikely to find better candidates; * "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). - * @param temperature the value used to modulate token probabilities for random sampling + * + * Random sampling parameters: + * @param temperature the value used to modulate token probabilities for random sampling. * @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering. - * @param do_sample whether or not to use multinomial random sampling - * that add up to `top_p` or higher are kept. - * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. See https://arxiv.org/pdf/1909.05858. - * @param pad_token_id id of padding token - * @param bos_token_id id of token - * @param eos_token_id id of token - * @param bos_token token string representation - * @param eos_token token string representation - * @param draft_model draft model for assitive decoding + * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. + * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. */ class OPENVINO_GENAI_EXPORTS GenerationConfig { public: GenerationConfig() = default; - GenerationConfig(std::string json_path); + explicit GenerationConfig(std::string json_path); // Generic size_t max_new_tokens = SIZE_MAX; @@ -89,6 +95,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { // used for chat scenario std::string bos_token = ""; std::string eos_token = ""; + + size_t get_max_new_tokens(size_t prompt_length = 0) const; + bool is_greedy_decoding() const; + bool is_beam_search() const; + bool is_multimomial() const; + static GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {}); }; -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 3bc8453d4e..7501058ca9 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -6,12 +6,13 @@ #include #include -#include +#include "openvino/core/any.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/streamer_base.hpp" namespace ov { +namespace genai { using StreamerVariant = std::variant, std::shared_ptr>; using OptionalGenerationConfig = std::optional; @@ -71,7 +72,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param device optional device * @param plugin_config optional plugin_config */ - LLMPipeline(std::string& path, std::string device="CPU", + LLMPipeline(const std::string& path, const std::string& device="CPU", const ov::AnyMap& plugin_config={}, const std::string& ov_tokenizers_path=""); @@ -84,11 +85,10 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param plugin_config optional plugin_config */ LLMPipeline( - const std::string model_path, - const 
ov::Tokenizer& tokenizer, - const std::string device="CPU", - const ov::AnyMap& plugin_config = {}, - const std::string& ov_tokenizers_path="" + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device="CPU", + const ov::AnyMap& plugin_config = {} ); ~LLMPipeline(); @@ -127,8 +127,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param generation_config optional GenerationConfig * @return DecodedResults a structure with resulting texts & scores */ - DecodedResults generate(std::vector texts, OptionalGenerationConfig generation_config); - DecodedResults generate(std::initializer_list text, OptionalGenerationConfig generation_config); + DecodedResults generate(const std::vector& texts, OptionalGenerationConfig generation_config); /** * @brief Low level generate to be called with already encoded input_ids tokens. @@ -153,14 +152,19 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { return generate(text, AnyMap{std::forward(properties)...}); } - DecodedResults operator()(std::vector text, OptionalGenerationConfig generation_config=std::nullopt); - DecodedResults operator()(std::initializer_list text, OptionalGenerationConfig generation_config=std::nullopt); + DecodedResults operator()(const std::vector& text, OptionalGenerationConfig generation_config=std::nullopt) { + return generate(text, generation_config); + } - // generate with streamers - std::string operator()(std::string text, OptionalGenerationConfig generation_config=std::nullopt, OptionalStreamerVariant streamer=std::nullopt); - std::string operator()(std::string text, OptionalStreamerVariant streamer); + std::string operator()( + std::string text, + OptionalGenerationConfig generation_config=std::nullopt, + OptionalStreamerVariant streamer=std::nullopt + ) { + return generate(text, generation_config, streamer); + } - ov::Tokenizer get_tokenizer(); + ov::genai::Tokenizer get_tokenizer(); GenerationConfig get_generation_config() const; void set_generation_config(const GenerationConfig& generation_config); @@ -174,10 +178,9 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { }; /* - * utils that allow to use generate and operarator() in the folllowing way: + * utils that allow to use generate and operator() in the following way: * pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...) * pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...) - * All names match to names in cofnig except streamer. */ static constexpr ov::Property max_new_tokens{"max_new_tokens"}; static constexpr ov::Property max_length{"max_length"}; @@ -207,6 +210,7 @@ static constexpr ov::Property eos_token{"eos_token"}; // only lambda streamer can be set via ov::streamer(),... syntaxic sugar, // because std::variant> can not be stored in AnyMap -static constexpr ov::Property> streamer_lambda{"streamer"}; +static constexpr ov::Property> streamer{"streamer"}; -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/streamer_base.hpp b/src/cpp/include/openvino/genai/streamer_base.hpp index 3f0879d702..7731b51c1c 100644 --- a/src/cpp/include/openvino/genai/streamer_base.hpp +++ b/src/cpp/include/openvino/genai/streamer_base.hpp @@ -6,6 +6,7 @@ #include "openvino/genai/tokenizer.hpp" namespace ov { +namespace genai { /** * @brief base class for streamers. 
In order to use inherit from from this class and inplement put, and methods @@ -15,7 +16,7 @@ namespace ov { class StreamerBase { public: Tokenizer m_tokenizer; - StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {}; + explicit StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {} StreamerBase() = default; /// @brief put is called every time new token is decoded @@ -25,4 +26,5 @@ class StreamerBase { virtual void end() = 0; }; -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 03c0cd64f7..e0214fcfbb 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -10,6 +10,7 @@ #include "openvino/genai/visibility.hpp" namespace ov { +namespace genai { /** * @brief class is used to encode prompts and decode resulting tokens @@ -78,4 +79,5 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { std::shared_ptr m_pimpl; }; -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 14fc370c59..66f31f7ffd 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -6,22 +6,15 @@ #include #include - #include "openvino/genai/generation_config.hpp" - -#include "generation_config_helper.hpp" #include "utils.hpp" -namespace { - - -} // namespace - namespace ov { +namespace genai { GenerationConfig::GenerationConfig(std::string json_path) { - using ov::generate_utils::read_json_param; + using ov::genai::utils::read_json_param; std::ifstream f(json_path); OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); @@ -62,10 +55,10 @@ GenerationConfig::GenerationConfig(std::string json_path) { } -GenerationConfig GenerationConfigHelper::anymap_to_generation_config(const ov::AnyMap& config_map) { - using ov::generate_utils::read_anymap_param; +GenerationConfig GenerationConfig::anymap_to_generation_config(const ov::AnyMap& config_map) { + using ov::genai::utils::read_anymap_param; - GenerationConfig config = m_config; + GenerationConfig config; read_anymap_param(config_map, "max_new_tokens", config.max_new_tokens); read_anymap_param(config_map, "max_length", config.max_length); read_anymap_param(config_map, "ignore_eos", config.ignore_eos); @@ -90,25 +83,26 @@ GenerationConfig GenerationConfigHelper::anymap_to_generation_config(const ov::A return config; } -size_t GenerationConfigHelper::get_max_new_tokens(size_t prompt_length) { +size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const { // max_new_tokens has priority over max_length, only if max_new_tokens was not specified use max_length - if (m_config.max_new_tokens != SIZE_MAX) { - return m_config.max_new_tokens; + if (max_new_tokens != SIZE_MAX) { + return max_new_tokens; } else { - return m_config.max_length - prompt_length; + return max_length - prompt_length; } } -bool GenerationConfigHelper::is_greedy_decoding() const { - return !m_config.do_sample && !is_beam_search(); +bool GenerationConfig::is_greedy_decoding() const { + return !do_sample && !is_beam_search(); } -bool GenerationConfigHelper::is_beam_search() const { - return m_config.num_beams > 1; +bool GenerationConfig::is_beam_search() const { + return num_beams > 1; } -bool GenerationConfigHelper::is_multimomial() const { - return m_config.do_sample; +bool GenerationConfig::is_multimomial() const { + return do_sample; } +} // namespace genai } // namespace 
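The helper queries that used to live in the separate `GenerationConfigHelper` are now plain members of `GenerationConfig`. Below is a minimal usage sketch (not part of this patch), assuming the members declared in this diff; the `AnyMap` keys and field types are taken from `anymap_to_generation_config` above, and the concrete values are illustrative only.

``` cpp
#include "openvino/genai/generation_config.hpp"

#include <cstddef>
#include <iostream>

int main() {
    // Build a config from an ov::AnyMap - the same conversion that
    // LLMPipeline::generate(text, AnyMap) applies in llm_pipeline.cpp below.
    ov::genai::GenerationConfig config = ov::genai::GenerationConfig::anymap_to_generation_config({
        {"max_new_tokens", size_t{64}},
        {"num_beams", size_t{6}},
        {"num_beam_groups", size_t{3}},
        {"diversity_penalty", 1.5f},
    });

    // max_new_tokens has priority over max_length; only when it is left at
    // SIZE_MAX does get_max_new_tokens fall back to max_length - prompt_length.
    std::cout << "tokens to generate: " << config.get_max_new_tokens(/*prompt_length=*/10) << '\n';
    std::cout << std::boolalpha
              << "beam search: " << config.is_beam_search() << '\n'
              << "greedy: " << config.is_greedy_decoding() << '\n';
}
```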
ov diff --git a/src/cpp/src/generation_config_helper.hpp b/src/cpp/src/generation_config_helper.hpp deleted file mode 100644 index f4e5839990..0000000000 --- a/src/cpp/src/generation_config_helper.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "openvino/genai/generation_config.hpp" - -namespace ov { - - -class GenerationConfigHelper { -public: - GenerationConfig m_config; - - GenerationConfigHelper() = default; - - GenerationConfigHelper(const GenerationConfig& config): m_config(config) {}; - - size_t get_max_new_tokens(size_t prompt_length = 0); - - bool is_greedy_decoding() const; - - bool is_beam_search() const; - - bool is_multimomial() const; - - GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {}); - -}; - -} // namespace ov diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 3298553a76..51e8023b42 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -1,17 +1,21 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "generation_config_helper.hpp" #include "openvino/genai/llm_pipeline.hpp" #include "utils.hpp" namespace ov { - -ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, - ov::Tensor input_ids, ov::Tensor attention_mask, ov::GenerationConfig generation_config, - std::shared_ptr streamer, bool is_chat_conversation) { +namespace genai { + +EncodedResults greedy_decoding( + ov::InferRequest& m_model_runner, + ov::Tensor input_ids, + ov::Tensor attention_mask, + const ov::genai::GenerationConfig generation_config, + const std::shared_ptr streamer, + const bool is_chat_conversation +) { - ov::GenerationConfigHelper config_helper = generation_config; ov::Shape prompts_shape = input_ids.get_shape(); size_t batch_size = prompts_shape[0]; size_t prompt_len = prompts_shape[1]; @@ -20,9 +24,9 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, // todo: make this work even if position_ids are not specified auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; - generate_utils::initialize_position_ids(position_ids, attention_mask, kv_cache_len); + utils::initialize_position_ids(position_ids, attention_mask, kv_cache_len); - ov::EncodedResults results; + EncodedResults results; results.scores.resize(batch_size); results.tokens.resize(batch_size); std::fill(results.scores.begin(), results.scores.end(), 0); @@ -58,7 +62,7 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, auto beam_data = m_model_runner.get_tensor("beam_idx").data(); std::iota(beam_data, beam_data + batch_size, 0); - size_t max_tokens = config_helper.get_max_new_tokens(prompt_len); + size_t max_tokens = generation_config.get_max_new_tokens(prompt_len); m_model_runner.infer(); auto logits = m_model_runner.get_tensor("logits"); @@ -69,7 +73,7 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, std::vector token_iter_results(batch_size); // results of a single infer request std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector for (size_t batch = 0; batch < batch_size; ++batch) { - auto res = generate_utils::softmax(logits, batch); + auto res = utils::softmax(logits, batch); auto out_token = res.first; results.tokens[batch].emplace_back(res.first); results.scores[batch] += res.second; @@ -86,8 +90,8 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& 
m_model_runner, return results; for (size_t i = 0; i < max_tokens - 1; ++i) { - generate_utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); - m_model_runner.set_tensor("attention_mask", generate_utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); + utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); + m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); // todo: consider replacing with start_async and run callback right after that m_model_runner.infer(); @@ -99,7 +103,7 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector for (size_t batch = 0; batch < batch_size; ++batch) { - auto res = ov::generate_utils::softmax(logits, batch); + auto res = ov::genai::utils::softmax(logits, batch); auto out_token = res.first; results.tokens[batch].emplace_back(res.first); results.scores[batch] += res.second; @@ -122,4 +126,5 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, return results; } -} \ No newline at end of file +} //namespace genai +} //namespace ov \ No newline at end of file diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index 312671c8f0..96138cec62 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include "generation_config_helper.hpp" #include "openvino/genai/llm_pipeline.hpp" #include "utils.hpp" @@ -87,12 +86,12 @@ bool greater(const Beam& left, const Beam& right) { struct Parameters { std::vector> prompts; - int64_t eos_token; + int64_t eos_token_id; size_t n_groups = 3; size_t group_size = 5; float diversity_penalty = 1.0; size_t max_new_tokens = 20; - ov::StopCriteria stop_criteria = ov::StopCriteria::heuristic; + ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::heuristic; float length_penalty = 1.0; size_t no_repeat_ngram_size = std::numeric_limits::max(); @@ -110,7 +109,7 @@ struct Group { beam.score /= std::pow(float(beam.tokens.size()), parameters.length_penalty); // HF implementation counts eos_token for length penalty calculation - if (beam.tokens.back() == parameters.eos_token) { + if (beam.tokens.back() == parameters.eos_token_id) { beam.tokens.pop_back(); } @@ -129,15 +128,15 @@ struct Group { float best_sum_logprobs = ongoing.front().score; float worst_score = min_heap.front().score; switch (parameters.stop_criteria) { - case ov::StopCriteria::early: + case ov::genai::StopCriteria::early: done = true; return; - case ov::StopCriteria::heuristic: { + case ov::genai::StopCriteria::heuristic: { float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty); done = worst_score >= highest_attainable_score; return; } - case ov::StopCriteria::never: { + case ov::genai::StopCriteria::never: { size_t length = parameters.length_penalty > 0.0 ? 
parameters.max_new_tokens : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty); done = worst_score >= highest_attainable_score; @@ -270,7 +269,7 @@ struct GroupBeamSearcher { std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater); group->ongoing.clear(); for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) { - if (parameters.eos_token == candidates.at(cand_idx).tokens.back()) { + if (parameters.eos_token_id == candidates.at(cand_idx).tokens.back()) { // If beam_token does not belong to top num_beams tokens, it should not be added if (cand_idx >= parameters.group_size) { continue; @@ -325,7 +324,7 @@ void initialize_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_ ov::Tensor position_ids = request.get_tensor("position_ids"); position_ids.set_shape(input_shape); - ov::generate_utils::initialize_position_ids(position_ids, attention_mask); + ov::genai::utils::initialize_position_ids(position_ids, attention_mask); ov::Tensor beam_idx = request.get_tensor("beam_idx"); beam_idx.set_shape({input_shape.at(0)}); @@ -368,9 +367,9 @@ void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention namespace ov { +namespace genai { EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig config) { - GenerationConfigHelper config_helper = config; OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, "number of beams should be divisible by number of groups"); // Initialize beam search @@ -388,7 +387,7 @@ EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor input_ids, ov::Tenso Parameters parameters{std::move(prompts)}; parameters.max_new_tokens = config.max_new_tokens; - parameters.eos_token = config.eos_token_id; + parameters.eos_token_id = config.eos_token_id; parameters.n_groups = config.num_beam_groups; parameters.group_size = config.num_beams / config.num_beam_groups; parameters.diversity_penalty = config.diversity_penalty; @@ -429,7 +428,7 @@ EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor input_ids, ov::Tenso auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; std::sort(beams.begin(), beams.end(), compare_scores); - ov::EncodedResults results; + ov::genai::EncodedResults results; for (auto beam = beams.begin(); beam != beams.begin() + config.num_return_sequences; ++beam) { results.scores.emplace_back(beam->score); results.tokens.emplace_back(beam->tokens); @@ -437,4 +436,5 @@ EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor input_ids, ov::Tenso return results; } -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 9ea685e583..4a3683bbd7 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -13,19 +13,61 @@ #include "openvino/genai/generation_config.hpp" #include "openvino/genai/llm_pipeline.hpp" #include "utils.hpp" -#include "generation_config_helper.hpp" #include "text_callback_streamer.hpp" +namespace { + +ov::genai::GenerationConfig from_config_json_if_exists(const std::string& path) { + constexpr char generation_config_fname[] = "generation_config.json"; + constexpr char config_fname[] = "config.json"; + if (std::filesystem::exists(path + "/" + generation_config_fname)) { + return ov::genai::GenerationConfig(path + "/" + generation_config_fname); + } else if (std::filesystem::exists(path + "/" + config_fname)) { + // some 
models (e.g. google/gemma-*) do not have generation_config.json, but have config.json + // and special tokens are stored there. + std::ifstream file(path + "/" + config_fname); + if (!file.is_open()) + return ov::genai::GenerationConfig{}; + + nlohmann::json data = nlohmann::json::parse(file); + using ov::genai::utils::read_json_param; + ov::genai::GenerationConfig config; + + read_json_param(data, "pad_token_id", config.pad_token_id); + read_json_param(data, "bos_token_id", config.bos_token_id); + read_json_param(data, "eos_token_id", config.eos_token_id); + return config; + + } + return ov::genai::GenerationConfig{}; +} + +std::string from_tokenizer_json_if_exists(const std::string& path) { + std::string res = ""; + + if (!std::filesystem::exists(path)) + return res; + + std::ifstream file(path + "/tokenizer_config.json"); + if (!file.is_open()) + return res; + + ov::genai::utils::read_json_param(nlohmann::json::parse(file), "chat_template", res); + return res; +} + +} namespace ov { +namespace genai { -ov::EncodedResults greedy_decoding( +ov::genai::EncodedResults greedy_decoding( ov::InferRequest& model_runner, ov::Tensor prompts, ov::Tensor attentin_mask, - GenerationConfig sampling_params, - std::shared_ptr streamer, - bool is_chat_conversation = false + const GenerationConfig sampling_params, + const std::shared_ptr streamer, + const bool is_chat_conversation = false ); EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig config); @@ -36,20 +78,22 @@ class LLMPipeline::LLMPipelineImpl { ov::InferRequest m_model_runner; Tokenizer m_tokenizer; GenerationConfig m_generation_config; - std::string m_device; - ov::AnyMap m_plugin_config; std::string m_chat_template = ""; bool is_chat_conversation = false; LLMPipelineImpl( - const std::string model_path, - const ov::Tokenizer& tokenizer, - const std::string device, - const ov::AnyMap& plugin_config, - const std::string& ov_tokenizers_path="" + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config ); - LLMPipelineImpl(std::string& path, std::string device, const ov::AnyMap& config, const std::string& ov_tokenizers_path=""); + LLMPipelineImpl( + const std::string& path, + const std::string& device, + const ov::AnyMap& config, + const std::string& ov_tokenizers_path="" + ); GenerationConfig generation_config() const; @@ -60,33 +104,31 @@ class LLMPipeline::LLMPipelineImpl { std::string apply_chat_template(std::string prompt, std::string role = "user") const; }; -} // namespace ov +} // namespace genai +} // namespace ov using namespace std; - -ov::LLMPipeline::LLMPipeline( - const std::string model_path, - const ov::Tokenizer& tokenizer, - const std::string device, - const ov::AnyMap& plugin_config, - const std::string& ov_tokenizers_path +ov::genai::LLMPipeline::LLMPipeline( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config ) { - m_pimpl = make_unique(model_path, tokenizer, device, plugin_config, ov_tokenizers_path); + m_pimpl = make_unique(model_path, tokenizer, device, plugin_config); } -ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( - const std::string model_path, - const ov::Tokenizer& tokenizer, - std::string device, - const ov::AnyMap& plugin_config, - const std::string& ov_tokenizers_path -): m_tokenizer(tokenizer), m_device(device), m_plugin_config(plugin_config) { 
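As an aside on the `from_config_json_if_exists` fallback introduced above: the defaults it loads become visible through `get_generation_config()`. A rough sketch (not part of this patch), assuming a converted model folder is passed on the command line as in the README samples:

``` cpp
#include "openvino/genai/llm_pipeline.hpp"

#include <iostream>
#include <string>

int main(int argc, char* argv[]) {
    // Folder produced by optimum-cli: openvino_model.xml, tokenizer models and *config.json files.
    std::string model_path = argv[1];
    ov::genai::LLMPipeline pipe(model_path, "CPU");

    // Defaults come from generation_config.json; if only config.json is present,
    // the special token ids are picked up from there instead.
    ov::genai::GenerationConfig config = pipe.get_generation_config();
    std::cout << "bos_token_id: " << config.bos_token_id << '\n'
              << "eos_token_id: " << config.eos_token_id << '\n';
}
```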
+ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config +): m_tokenizer(tokenizer) { ov::Core core; std::string full_path = model_path; - if (!ov::generate_utils::is_xml(full_path)) + if (!ov::genai::utils::is_xml(full_path)) full_path += "/openvino_model.xml"; try { m_model_runner = core.compile_model(full_path, device, plugin_config).create_infer_request(); @@ -95,53 +137,36 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( } } -ov::LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::AnyMap& config, const std::string& ov_tokenizers_path) { +ov::genai::LLMPipeline::LLMPipeline( + const std::string& path, + const std::string& device, + const ov::AnyMap& config, + const std::string& ov_tokenizers_path +) { m_pimpl = make_unique(path, device, config, ov_tokenizers_path); } -ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(std::string& path, std::string device, - const ov::AnyMap& config, const std::string& ov_tokenizers_path) { - std::string config_path = path + "/" + "config.json"; - std::string tokenizer_config_path = path + "/" +"tokenizer_config.json"; - std::string generation_config_path = path + "/" +"generation_config.json"; - - if (std::filesystem::exists(generation_config_path)) { - m_generation_config = GenerationConfig(generation_config_path); - } else if (std::filesystem::exists(config_path)) { - // some models (e.g. google/gemma-*) do not have generation_config.json, but have config.json - // and special tokens are stored there. - - std::ifstream f(config_path); - OPENVINO_ASSERT(f.is_open(), "Failed to open '" + config_path + "' with config.json"); - - nlohmann::json data = nlohmann::json::parse(f); - using ov::generate_utils::read_json_param; - read_json_param(data, "pad_token_id", m_generation_config.pad_token_id); - read_json_param(data, "bos_token_id", m_generation_config.bos_token_id); - read_json_param(data, "eos_token_id", m_generation_config.eos_token_id); - } - - if (std::filesystem::exists(tokenizer_config_path)) { - std::ifstream f(tokenizer_config_path); - ov::generate_utils::read_json_param(nlohmann::json::parse(f), "chat_template", m_chat_template); - } - - m_device = device; - - ov::Core core; - m_model_runner = core.compile_model(path + "/openvino_model.xml", device, config).create_infer_request(); - m_tokenizer = Tokenizer(path, device, ov_tokenizers_path); -} - -ov::GenerationConfig ov::LLMPipeline::LLMPipelineImpl::generation_config() const { +ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( + const std::string& path, + const std::string& device, + const ov::AnyMap& config, + const std::string& ov_tokenizers_path +): + m_model_runner{ov::Core{}.compile_model(path + "/openvino_model.xml", device, config).create_infer_request()}, + m_tokenizer{Tokenizer(path, device, ov_tokenizers_path)}, + m_generation_config{from_config_json_if_exists(path)}, + m_chat_template{from_tokenizer_json_if_exists(path)} + {} + +ov::genai::GenerationConfig ov::genai::LLMPipeline::LLMPipelineImpl::generation_config() const { return m_generation_config; } -ov::GenerationConfig ov::LLMPipeline::get_generation_config() const { +ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { return m_pimpl->generation_config(); } -std::string ov::LLMPipeline::LLMPipelineImpl::generate( +std::string ov::genai::LLMPipeline::LLMPipelineImpl::generate( std::string text, OptionalGenerationConfig 
generation_config, OptionalStreamerVariant streamer @@ -191,15 +216,11 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate( return m_tokenizer.decode(generate_results.tokens)[0]; } -ov::DecodedResults ov::LLMPipeline::generate(std::vector texts, OptionalGenerationConfig generation_config) { +ov::genai::DecodedResults ov::genai::LLMPipeline::generate(const std::vector& texts, OptionalGenerationConfig generation_config) { return m_pimpl->generate(texts, generation_config); } -ov::DecodedResults ov::LLMPipeline::generate(std::initializer_list text, OptionalGenerationConfig generation_config) { - return m_pimpl->generate(text, generation_config); -} - -ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { +ov::genai::DecodedResults ov::genai::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { auto [input_ids, attention_mask] = m_tokenizer.encode(texts); auto generate_results = generate(input_ids, attention_mask, generation_config, {}); @@ -207,29 +228,20 @@ ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { - return m_pimpl-> generate(texts, generation_config); -} - -ov::DecodedResults ov::LLMPipeline::operator()(std::initializer_list text, OptionalGenerationConfig generation_config) { - return m_pimpl->generate(text, generation_config); -} - -ov::EncodedResults ov::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, +ov::genai::EncodedResults ov::genai::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer) { return m_pimpl->generate(input_ids, attention_mask, generation_config, streamer); } -ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( +ov::genai::EncodedResults ov::genai::LLMPipeline::LLMPipelineImpl::generate( ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer ) { - ov::EncodedResults result; + ov::genai::EncodedResults result; GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; - GenerationConfigHelper config_helper = config; std::shared_ptr streamer_ptr; if (!streamer.has_value()){ @@ -240,15 +252,15 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( streamer_ptr = std::make_shared(m_tokenizer, *callback); } auto batch_size = input_ids.get_shape().at(0); - if ((batch_size != 1 || !config_helper.is_greedy_decoding()) && streamer_ptr) { + if ((batch_size != 1 || !config.is_greedy_decoding()) && streamer_ptr) { OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy decoding"); } - auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::generate_utils::init_attention_mask(input_ids); + auto attention_mask_data = attention_mask.has_value() ? 
*attention_mask : ov::genai::utils::init_attention_mask(input_ids); - if (config_helper.is_greedy_decoding()) { - result = ov::greedy_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr, is_chat_conversation); - } else if (config_helper.is_beam_search()) { + if (config.is_greedy_decoding()) { + result = ov::genai::greedy_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr, is_chat_conversation); + } else if (config.is_beam_search()) { result = beam_search(m_model_runner, input_ids, attention_mask_data, config); } else { // todo: implement multinomial sampling @@ -261,13 +273,13 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( return result; } -std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer) { +std::string ov::genai::LLMPipeline::generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer) { return m_pimpl->generate(text, generation_config, streamer); } -std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) { +std::string ov::genai::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) { OptionalStreamerVariant streamer; - auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map); + auto config = GenerationConfig::anymap_to_generation_config(config_map); if (config_map.count("streamer")) { streamer = config_map.at("streamer").as>(); } @@ -275,9 +287,9 @@ std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config return m_pimpl->generate(text, config, streamer); } -ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, const ov::AnyMap& config_map) { +ov::genai::EncodedResults ov::genai::LLMPipeline::generate(ov::Tensor input_ids, const ov::AnyMap& config_map) { OptionalStreamerVariant streamer; - auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map); + auto config = GenerationConfig::anymap_to_generation_config(config_map); if (config_map.count("streamer")) { streamer = config_map.at("streamer").as>(); } @@ -286,23 +298,15 @@ ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, const ov::Any return m_pimpl->generate(input_ids, attention_mask, config, streamer); } -std::string ov::LLMPipeline::operator()(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer) { - return m_pimpl->generate(text, generation_config, streamer); -} - -std::string ov::LLMPipeline::operator()(std::string text, OptionalStreamerVariant streamer) { - return m_pimpl->generate(text, m_pimpl->m_generation_config, streamer); -} - -ov::Tokenizer ov::LLMPipeline::get_tokenizer() { +ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() { return m_pimpl->m_tokenizer; } -std::string ov::LLMPipeline::apply_chat_template(std::string prompt, std::string role) const { +std::string ov::genai::LLMPipeline::apply_chat_template(std::string prompt, std::string role) const { return m_pimpl->apply_chat_template(prompt, role); } -std::string ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string prompt, std::string role) const { +std::string ov::genai::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string prompt, std::string role) const { jinja2::TemplateEnv env; env.GetSettings().lstripBlocks = true; env.GetSettings().trimBlocks = true; @@ -320,21 +324,21 @@ std::string 
ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string pr return tpl.RenderAsString(params).value(); } -void ov::LLMPipeline::start_chat() { +void ov::genai::LLMPipeline::start_chat() { m_pimpl->is_chat_conversation = true; } -void ov::LLMPipeline::finish_chat() { +void ov::genai::LLMPipeline::finish_chat() { m_pimpl->is_chat_conversation = false; reset_state(); } -void ov::LLMPipeline::reset_state() { +void ov::genai::LLMPipeline::reset_state() { m_pimpl->m_model_runner.reset_state(); } -void ov::LLMPipeline::set_generation_config(const GenerationConfig& generation_config) { +void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& generation_config) { m_pimpl->m_generation_config = generation_config; } -ov::LLMPipeline::~LLMPipeline() = default; +ov::genai::LLMPipeline::~LLMPipeline() = default; diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index f9b3ad8ccd..bb2bec09d9 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -1,6 +1,7 @@ #include "text_callback_streamer.hpp" namespace ov { +namespace genai { TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, bool print_eos_token) { m_tokenizer = tokenizer; @@ -70,4 +71,5 @@ void TextCallbackStreamer::on_finalized_text(const std::string& subword) { } } -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp index d9c1ba3ee5..3834dd01ba 100644 --- a/src/cpp/src/text_callback_streamer.hpp +++ b/src/cpp/src/text_callback_streamer.hpp @@ -6,6 +6,7 @@ #include "openvino/genai/tokenizer.hpp" namespace ov { +namespace genai { class TextCallbackStreamer: public StreamerBase { public: @@ -32,4 +33,5 @@ class TextCallbackStreamer: public StreamerBase { void on_finalized_text(const std::string& subword); }; -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 778778faec..2cecdad22a 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -42,12 +42,12 @@ std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& } namespace ov { +namespace genai { class Tokenizer::TokenizerImpl { public: ov::InferRequest m_tokenize_request; ov::InferRequest m_detokenizer_request; - std::string m_device; int64_t m_pad_token_id = 0; int64_t m_bos_token_id = 1; int64_t m_eos_token_id = 2; @@ -56,7 +56,7 @@ class Tokenizer::TokenizerImpl { TokenizerImpl(std::string tokenizers_path, const std::string device, const std::string& ov_tokenizers_path) { ov::Core core; - if (ov::generate_utils::is_xml(tokenizers_path)) + if (ov::genai::utils::is_xml(tokenizers_path)) OPENVINO_THROW("tokenizers_path should be a path to a dir not a xml file"); if (ov_tokenizers_path.empty()) { @@ -202,4 +202,5 @@ void Tokenizer::set_eos_token_id(int64_t eos_token_id) { Tokenizer::~Tokenizer() = default; -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index dbd18cf3f3..477a6efd54 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -4,7 +4,8 @@ #include "utils.hpp" namespace ov { -namespace generate_utils { +namespace genai { +namespace utils { Tensor init_attention_mask(Tensor& position_ids) { auto shape = position_ids.get_shape(); @@ -135,5 +136,6 @@ ov::Tensor extend_attention(ov::Tensor attention_mask) { return new_atten_mask; } -} // namespace generate_utils +} 
// namespace utils +} // namespace genai } // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index d7998a9594..4559a8962f 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -7,7 +7,8 @@ #include namespace ov { -namespace generate_utils { +namespace genai { +namespace utils { Tensor init_attention_mask(Tensor& position_ids); @@ -58,6 +59,7 @@ void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& } } -} // namespace generate_utils +} // namespace utils +} // namespace genai } // namespace ov diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 2aee67593c..fa944bb4eb 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -7,9 +7,15 @@ #include "openvino/genai/llm_pipeline.hpp" namespace py = pybind11; -using namespace ov; - -void str_to_stop_criteria(ov::GenerationConfig& config, const std::string& stop_criteria_str){ +using ov::genai::LLMPipeline; +using ov::genai::Tokenizer; +using ov::genai::GenerationConfig; +using ov::genai::EncodedResults; +using ov::genai::DecodedResults; +using ov::genai::StopCriteria; +using ov::genai::StreamerBase; + +void str_to_stop_criteria(GenerationConfig& config, const std::string& stop_criteria_str){ if (stop_criteria_str == "early") config.stop_criteria = StopCriteria::early; else if (stop_criteria_str == "never") config.stop_criteria = StopCriteria::never; else if (stop_criteria_str == "heuristic") config.stop_criteria = StopCriteria::heuristic; @@ -17,16 +23,16 @@ void str_to_stop_criteria(ov::GenerationConfig& config, const std::string& stop_ "Allowed values are: \"early\", \"never\", \"heuristic\". "); } -std::string stop_criteria_to_str(const ov::GenerationConfig& config) { +std::string stop_criteria_to_str(const GenerationConfig& config) { switch (config.stop_criteria) { - case ov::StopCriteria::early: return "early"; - case ov::StopCriteria::heuristic: return "heuristic"; - case ov::StopCriteria::never: return "never"; + case StopCriteria::early: return "early"; + case StopCriteria::heuristic: return "heuristic"; + case StopCriteria::never: return "never"; default: throw std::runtime_error("Incorrect stop_criteria"); } } -void update_config_from_kwargs(ov::GenerationConfig& config, const py::kwargs& kwargs) { +void update_config_from_kwargs(GenerationConfig& config, const py::kwargs& kwargs) { if (kwargs.contains("max_new_tokens")) config.max_new_tokens = kwargs["max_new_tokens"].cast(); if (kwargs.contains("max_length")) config.max_length = kwargs["max_length"].cast(); if (kwargs.contains("ignore_eos")) config.ignore_eos = kwargs["ignore_eos"].cast(); @@ -50,14 +56,14 @@ void update_config_from_kwargs(ov::GenerationConfig& config, const py::kwargs& k } // operator() and generate methods are identical, operator() is just an alias for generate -std::string call_with_kwargs(ov::LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) { +std::string call_with_kwargs(LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) { // Create a new GenerationConfig instance and initialize from kwargs - ov::GenerationConfig config = pipeline.get_generation_config(); + GenerationConfig config = pipeline.get_generation_config(); update_config_from_kwargs(config, kwargs); return pipeline(text, config); } -std::string call_with_config(ov::LLMPipeline& pipe, const std::string& text, const ov::GenerationConfig& config) { +std::string call_with_config(LLMPipeline& pipe, const std::string& text, 
const GenerationConfig& config) { std::shared_ptr streamer; return pipe(text, config); } @@ -72,15 +78,15 @@ PYBIND11_MODULE(py_generate_pipeline, m) { m.doc() = "Pybind11 binding for LLM Pipeline"; py::class_(m, "LLMPipeline") - .def(py::init(), + .def(py::init(), py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", - py::arg("plugin_config") = ov::AnyMap{}, py::arg("ov_tokenizers_path") = ov_tokenizers_module_path()) + py::arg("plugin_config") = ov::AnyMap{}) .def(py::init(), py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}, py::arg("ov_tokenizers_path") = ov_tokenizers_module_path()) - .def("__call__", py::overload_cast(&call_with_kwargs)) - .def("__call__", py::overload_cast(&call_with_config)) - .def("generate", py::overload_cast(&call_with_kwargs)) - .def("generate", py::overload_cast(&call_with_config)) + .def("__call__", py::overload_cast(&call_with_kwargs)) + .def("__call__", py::overload_cast(&call_with_config)) + .def("generate", py::overload_cast(&call_with_kwargs)) + .def("generate", py::overload_cast(&call_with_config)) // todo: if input_ids is a ov::Tensor/numpy tensor // todo: implement calling generate/operator() with StreamerBase or lambda streamer @@ -92,15 +98,15 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def("get_tokenizer", &LLMPipeline::get_tokenizer) - .def("start_chat", &ov::LLMPipeline::start_chat) - .def("finish_chat", &ov::LLMPipeline::finish_chat) - .def("reset_state", &ov::LLMPipeline::reset_state) - .def("get_generation_config", &ov::LLMPipeline::get_generation_config, py::return_value_policy::copy) - .def("set_generation_config", &ov::LLMPipeline::set_generation_config) + .def("start_chat", &LLMPipeline::start_chat) + .def("finish_chat", &LLMPipeline::finish_chat) + .def("reset_state", &LLMPipeline::reset_state) + .def("get_generation_config", &LLMPipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &LLMPipeline::set_generation_config) .def("apply_chat_template", &LLMPipeline::apply_chat_template); // Binding for Tokenizer - py::class_(m, "Tokenizer") + py::class_(m, "Tokenizer") .def(py::init<>()) .def(py::init(), py::arg("tokenizers_path"), @@ -108,46 +114,46 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::arg("ov_tokenizers_path") = py::str(ov_tokenizers_module_path())) // todo: implement encode/decode when for numpy inputs and outputs - .def("encode", py::overload_cast(&ov::Tokenizer::encode), "Encode a single prompt") + .def("encode", py::overload_cast(&Tokenizer::encode), "Encode a single prompt") // TODO: common.h(1106...) 
template argument deduction/substitution failed: - // .def("encode", py::overload_cast&>(&ov::Tokenizer::encode), "Encode multiple prompts") - .def("decode", py::overload_cast>(&ov::Tokenizer::decode), "Decode a list of tokens") - .def("decode", py::overload_cast(&ov::Tokenizer::decode), "Decode a tensor of tokens") - .def("decode", py::overload_cast>>(&ov::Tokenizer::decode), "Decode multiple lines of tokens"); + // .def("encode", py::overload_cast&>(&Tokenizer::encode), "Encode multiple prompts") + .def("decode", py::overload_cast>(&Tokenizer::decode), "Decode a list of tokens") + .def("decode", py::overload_cast(&Tokenizer::decode), "Decode a tensor of tokens") + .def("decode", py::overload_cast>>(&Tokenizer::decode), "Decode multiple lines of tokens"); // Binding for GenerationConfig - py::class_(m, "GenerationConfig") + py::class_(m, "GenerationConfig") .def(py::init<>()) .def(py::init()) - .def_readwrite("max_new_tokens", &ov::GenerationConfig::max_new_tokens) - .def_readwrite("max_length", &ov::GenerationConfig::max_length) - .def_readwrite("ignore_eos", &ov::GenerationConfig::ignore_eos) - .def_readwrite("num_beam_groups", &ov::GenerationConfig::num_beam_groups) - .def_readwrite("num_beams", &ov::GenerationConfig::num_beams) - .def_readwrite("diversity_penalty", &ov::GenerationConfig::diversity_penalty) - .def_readwrite("length_penalty", &ov::GenerationConfig::length_penalty) - .def_readwrite("num_return_sequences", &ov::GenerationConfig::num_return_sequences) - .def_readwrite("no_repeat_ngram_size", &ov::GenerationConfig::no_repeat_ngram_size) + .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) + .def_readwrite("max_length", &GenerationConfig::max_length) + .def_readwrite("ignore_eos", &GenerationConfig::ignore_eos) + .def_readwrite("num_beam_groups", &GenerationConfig::num_beam_groups) + .def_readwrite("num_beams", &GenerationConfig::num_beams) + .def_readwrite("diversity_penalty", &GenerationConfig::diversity_penalty) + .def_readwrite("length_penalty", &GenerationConfig::length_penalty) + .def_readwrite("num_return_sequences", &GenerationConfig::num_return_sequences) + .def_readwrite("no_repeat_ngram_size", &GenerationConfig::no_repeat_ngram_size) .def_property("stop_criteria", &stop_criteria_to_str, &str_to_stop_criteria) - .def_readwrite("temperature", &ov::GenerationConfig::temperature) - .def_readwrite("top_p", &ov::GenerationConfig::top_p) - .def_readwrite("top_k", &ov::GenerationConfig::top_k) - .def_readwrite("do_sample", &ov::GenerationConfig::do_sample) - .def_readwrite("repetition_penalty", &ov::GenerationConfig::repetition_penalty) - .def_readwrite("pad_token_id", &ov::GenerationConfig::pad_token_id) - .def_readwrite("bos_token_id", &ov::GenerationConfig::bos_token_id) - .def_readwrite("eos_token_id", &ov::GenerationConfig::eos_token_id) - .def_readwrite("eos_token", &ov::GenerationConfig::eos_token) - .def_readwrite("bos_token", &ov::GenerationConfig::bos_token); - - py::class_(m, "DecodedResults") + .def_readwrite("temperature", &GenerationConfig::temperature) + .def_readwrite("top_p", &GenerationConfig::top_p) + .def_readwrite("top_k", &GenerationConfig::top_k) + .def_readwrite("do_sample", &GenerationConfig::do_sample) + .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty) + .def_readwrite("pad_token_id", &GenerationConfig::pad_token_id) + .def_readwrite("bos_token_id", &GenerationConfig::bos_token_id) + .def_readwrite("eos_token_id", &GenerationConfig::eos_token_id) + .def_readwrite("eos_token", 
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py
index 1d46e227c9..e7f9adf5d5 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_generate_api.py
@@ -14,6 +14,7 @@ def model_fixture(request):
     return model_id, path, tokenizer, model

 def run_hf_ov_genai_comparison(model_fixture, generation_config, prompt):
+    import openvino_genai as ov_genai
     model_id, path, tokenizer, model = model_fixture

     generation_config_hf = generation_config.copy()
@@ -28,10 +29,13 @@ def run_hf_ov_genai_comparison(model_fixture, generation_config, prompt):
     hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:])

     device = 'CPU'
-    ov_tokenizers_path = '../../build/openvino_tokenizers/src/'
-    import openvino_genai as ov_genai
+    # pipe = ov_genai.LLMPipeline(path, device)
+    import os
+    build_dir = os.getenv('GENAI_BUILD_DIR', 'build')
+    ov_tokenizers_path = f'{build_dir}/openvino_tokenizers/src/'
     pipe = ov_genai.LLMPipeline(path, device, {}, ov_tokenizers_path)
+
     ov_output = pipe.generate(prompt, **generation_config)

     if hf_output != ov_output:
@@ -46,7 +50,7 @@ def stop_criteria_map():

 test_cases = [
     (dict(max_new_tokens=20, do_sample=False), 'table is made of'),  # generation_config, prompt
-    # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'),
+    (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'),
     # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'Alan Turing was a'),
     # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'),
     # (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'),
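Together with the `GENAI_BUILD_DIR` fallback above, the comparison test no longer assumes a fixed `../../build` layout. Assuming the suite is driven with pytest (as the fixtures suggest), a run could be configured as below; the directory values are examples, while the patch's own fallbacks are `''` and `'build'`:

```python
# Example launcher for the comparison tests; paths are illustrative.
import os
import subprocess

env = dict(os.environ)
env['GENAI_MODELS_PATH_PREFIX'] = '/mnt/models'   # where the converted test models live
env['GENAI_BUILD_DIR'] = 'build'                  # contains openvino_tokenizers/src/

subprocess.run(
    ['python', '-m', 'pytest', 'tests/python_tests/test_generate_api.py', '-v'],
    env=env,
    check=True,
)
```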
diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
index 1afc5f93ed..474537de17 100644
--- a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
+++ b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
@@ -16,8 +16,8 @@ int main(int argc, char* argv[]) try {
     std::string model_path = argv[1];

     std::string device = "CPU";  // GPU can be used as well
-    ov::LLMPipeline pipe(model_path, device);
-    ov::GenerationConfig config = pipe.get_generation_config();
+    ov::genai::LLMPipeline pipe(model_path, device);
+    ov::genai::GenerationConfig config = pipe.get_generation_config();
     config.max_new_tokens = 20;
     config.num_beam_groups = 3;
     config.num_beams = 15;
diff --git a/text_generation/causal_lm/cpp/chat_sample.cpp b/text_generation/causal_lm/cpp/chat_sample.cpp
index b1ecb5f5f4..3e215e5208 100644
--- a/text_generation/causal_lm/cpp/chat_sample.cpp
+++ b/text_generation/causal_lm/cpp/chat_sample.cpp
@@ -20,9 +20,9 @@ int main(int argc, char* argv[]) try {
     std::string accumulated_str = "";

     std::string model_path = argv[1];
-    ov::LLMPipeline pipe(model_path, "CPU");
+    ov::genai::LLMPipeline pipe(model_path, "CPU");

-    ov::GenerationConfig config = pipe.get_generation_config();
+    ov::genai::GenerationConfig config = pipe.get_generation_config();
     config.max_new_tokens = 10000;
     auto streamer = [](std::string word) { std::cout << word << std::flush; };
@@ -35,7 +35,7 @@ int main(int argc, char* argv[]) try {
         cout << prompt << endl;

         // auto answer_str = pipe(prompt, config, streamer);
-        auto answer_str = pipe.generate(prompt, ov::max_new_tokens(10000), ov::streamer_lambda(streamer));
+        auto answer_str = pipe.generate(prompt, ov::genai::max_new_tokens(10000), ov::genai::streamer(streamer));
         accumulated_str += answer_str;

         cout << "\n----------\n";
diff --git a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp
index e410d170ca..0fea9b36d3 100644
--- a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp
+++ b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp
@@ -14,8 +14,8 @@ int main(int argc, char* argv[]) try {
     std::string device = "CPU";
     if (argc > 3) device = argv[3];
-    ov::LLMPipeline pipe(model_path, device);
-    ov::GenerationConfig config = pipe.get_generation_config();
+    ov::genai::LLMPipeline pipe(model_path, device);
+    ov::genai::GenerationConfig config = pipe.get_generation_config();
     config.max_new_tokens = 100;
     config.do_sample = false;
     auto streamer = [](std::string subword){std::cout << subword << std::flush;};