Commit

Merge branch 'generate_pipeline' into fix-abi
Wovchena committed May 27, 2024
2 parents 78666da + bbc8c25 commit 140b59c
Showing 23 changed files with 353 additions and 353 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/genai_package.yml
@@ -18,7 +18,7 @@ jobs:
- run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- run: sudo apt-get install libtbb-dev
- run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/
- run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package
- run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j
- run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov
- run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace
if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build
@@ -49,7 +49,7 @@ jobs:
- run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.2.0-15349-765302e0de1/w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64.zip
- run: unzip ov.zip
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install"
if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build
9 changes: 8 additions & 1 deletion .github/workflows/genai_python_lib.yml
@@ -22,7 +22,7 @@ jobs:
# build system doesn't. Install ./requirements-build.txt to detect possible conflicts.
- run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
- run: PYTHONPATH=./src/python/ python -c "from openvino_genai import LLMPipeline"
- run: source ./ov/setupvars.sh && CMAKE_BUILD_PARALLEL_LEVEL="" python -m pip install .
- run: source ./ov/setupvars.sh && CMAKE_BUILD_PARALLEL_LEVEL="" python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
- run: python -c "from openvino_genai import LLMPipeline"
- name: GenAI Python API tests
run: |
@@ -51,9 +51,16 @@ jobs:
- run: unzip ov.zip
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j
<<<<<<< HEAD
# GitHub Actions already provides what is listed in ./requirements-build.txt but the internal
# build system doesn't. Install ./requirements-build.txt to detect possible conflicts.
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
=======
- run: python -m pip install "numpy<1.27"
# GitHub Actions already provides what is listed in ./requirements-build.txt but the internal
# build system doesn't. Install ./requirements-build.txt to detect possible conflicts.
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt
>>>>>>> generate_pipeline
- run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -c "from openvino_genai import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
- run: set CMAKE_BUILD_PARALLEL_LEVEL=&& call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install .
- run: python -c "from openvino_genai import LLMPipeline"
70 changes: 24 additions & 46 deletions src/README.md
@@ -8,7 +8,7 @@ optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weigh
pip install openvino-genai
```

LLMPipeline is the main object used for decoding. You can initiliza it straigh away from the folder with the converted model. It will automanically load the main model, tokenizer, detokenizer and default generation configuration.
`LLMPipeline` is the main object used for decoding. You can construct it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and default generation configuration.

### Python

@@ -24,8 +24,8 @@ Calling generate with custom generation config parameters, e.g. config for group
import openvino_genai as ov_genai
pipe = ov_genai.LLMPipeline(model_path, "CPU")

res = pipe.generate("The Sun is yellow bacause", max_new_tokens=30, num_groups=3, group_size=5)
print(res)
result = pipe.generate("The Sun is yellow bacause", max_new_tokens=30, num_groups=3, group_size=5, diversity_penalty=1.5)
print(result)
```

output:
@@ -38,7 +38,7 @@ A simples chat in python:
import openvino_genai as ov_genai
pipe = ov_genai.LLMPipeline(model_path)

config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.1}
config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.5}
pipe.set_generation_config(config)

pipe.start_chat()
@@ -49,7 +49,6 @@ while True:
        break
    print(pipe(prompt))
pipe.finish_chat()

```

Test to compare with Huggingface outputs
@@ -63,7 +62,7 @@ Minimalistc example

int main(int argc, char* argv[]) {
std::string model_path = argv[1];
ov::LLMPipeline pipe(model_path, "CPU");
ov::genai::LLMPipeline pipe(model_path, "CPU");
std::cout << pipe.generate("The Sun is yellow bacause");
}
```
@@ -75,9 +74,9 @@ Using Group Beam Search Decoding
int main(int argc, char* argv[]) {
std::string model_path = argv[1];
ov::LLMPipeline pipe(model_path, "CPU");
ov::genai::LLMPipeline pipe(model_path, "CPU");
ov::GenerationConfig config = pipe.get_generation_config();
ov::genai::GenerationConfig config = pipe.get_generation_config();
config.max_new_tokens = 256;
config.num_groups = 3;
config.group_size = 5;
@@ -87,7 +86,7 @@ int main(int argc, char* argv[]) {
}
```

A simplest chat in C++
A simple chat in C++ using grouped beam search decoding
``` cpp
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
@@ -96,71 +95,50 @@ int main(int argc, char* argv[]) {
std::string prompt;

std::string model_path = argv[1];
ov::LLMPipeline pipe(model_path, "CPU");

pipe.start_chat();
for (size_t i = 0; i < questions.size(); i++) {
std::cout << "question:\n";
std::getline(std::cin, prompt);

std::cout << pipe(prompt) << std::endl>>;
}
pipe.finish_chat();
}
```
Specifying generation_config to use grouped beam search
``` cpp
int main(int argc, char* argv[]) {
std::string prompt;
std::string model_path = argv[1];
ov::LLMPipeline pipe(model_path, "CPU");
ov::genai::LLMPipeline pipe(model_path, "CPU");

ov::GenerationConfig config = pipe.get_generation_config();
ov::genai::GenerationConfig config = pipe.get_generation_config();
config.max_new_tokens = 256;
config.num_groups = 3;
config.group_size = 5;
config.diversity_penalty = 1.0f;

auto streamer = [](std::string word) { std::cout << word << std::flush; };
pipe.start_chat();
for (size_t i = 0; i < questions.size(); i++) {
for (;;) {
std::cout << "question:\n";
cout << prompt << endl;
std::getline(std::cin, prompt);
if (prompt == "Stop!")
break;

auto answer = pipe(prompt, config, streamer);
// no need to print answer, streamer will do that
std::cout << "answer:\n";
auto answer = pipe(prompt, config);
std::cout << answer << std::endl;
}
pipe.finish_chat();
}
```
Streaming exapmle with lambda function

Streaming example with lambda function
``` cpp

#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
int main(int argc, char* argv[]) {
std::string model_path = argv[1];
ov::LLMPipeline pipe(model_path, "CPU");
ov::genai::LLMPipeline pipe(model_path, "CPU");
auto streamer = [](std::string word) { std::cout << word << std::flush; };
std::cout << pipe.generate("The Sun is yellow bacause", streamer);
}
```

Streaming with custom class
Streaming with a custom class
``` cpp
#include <streamer_base.hpp>
#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

class CustomStreamer: publict StreamerBase {
class CustomStreamer: public ov::genai::StreamerBase {
public:
void put(int64_t token) {
/* custom decoding/tokens processing code
@@ -179,7 +157,7 @@ int main(int argc, char* argv[]) {
CustomStreamer custom_streamer;

std::string model_path = argv[1];
ov::LLMPipeline pipe(model_path, "CPU");
cout << pipe.generate("The Sun is yellow bacause", custom_streamer);
ov::genai::LLMPipeline pipe(model_path, "CPU");
std::cout << pipe.generate("The Sun is yellow bacause", custom_streamer);
}
```
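
Editor's note: the custom-streamer example above is partly elided by the diff view. A self-contained sketch of the same idea follows; it is not part of this commit, it assumes `StreamerBase` also declares an `end()` finalization hook (not visible in this excerpt), and it passes the streamer as a `std::shared_ptr` to match the `StreamerVariant` definition that appears in `llm_pipeline.hpp` further down.
``` cpp
#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <memory>
#include <string>

// Minimal streamer that prints each produced token id as soon as it arrives.
class CustomStreamer : public ov::genai::StreamerBase {
public:
    void put(int64_t token) override {
        // Custom decoding / token post-processing would go here.
        std::cout << token << ' ' << std::flush;
    }
    void end() override {
        // Assumed finalization hook: flush whatever the decoder still buffers.
        std::cout << std::endl;
    }
};

int main(int argc, char* argv[]) {
    std::string model_path = argv[1];
    ov::genai::LLMPipeline pipe(model_path, "CPU");

    std::shared_ptr<ov::genai::StreamerBase> streamer = std::make_shared<CustomStreamer>();
    std::cout << pipe.generate("The Sun is yellow because", streamer);
}
```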
43 changes: 28 additions & 15 deletions src/cpp/include/openvino/genai/generation_config.hpp
@@ -12,6 +12,7 @@
#include "openvino/genai/tokenizer.hpp"

namespace ov {
namespace genai {

/**
* @brief controls the stopping condition for grouped beam search. The following values are possible:
@@ -22,43 +23,48 @@ namespace ov {
enum class StopCriteria { early, heuristic, never };

/**
* @brief structure to keep generation config parameters.
* @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
* and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
* be used while greedy and beam search parameters will not affect decoding at all.
*
* Generic parameters:
* @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
* `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set.
* @param max_new_tokens the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
* @param ignore_eos if set to true, then generation will not stop even if <eos> token is met.
* @param pad_token_id token_id of <pad> (padding)
* @param bos_token_id token_id of <bos> (beggining of sentence)
* @param eos_token_id token_id of <eos> (end of sentence)
* @param bos_token <bos> token string representation
* @param eos_token <eos> token string representation
*
* Beam search specific parameters:
* @param num_beams number of beams for beam search. 1 disables beam search.
* @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
* @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from other group at a
* particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled.
* particular time. See https://arxiv.org/pdf/1909.05858.
* @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
* the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
* likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
* `length_penalty` < 0.0 encourages shorter sequences.
* @param num_return_sequences the number of sequences to return for grouped beam search decoding
* @param num_return_sequences the number of sequences to return for grouped beam search decoding.
* @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once.
* @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values:
* "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an
* heuristic is applied and the generation stops when is it very unlikely to find better candidates;
* "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
* @param temperature the value used to modulate token probabilities for random sampling
*
* Random sampling parameters:
* @param temperature the value used to modulate token probabilities for random sampling.
* @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
* @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
* @param do_sample whether or not to use multinomial random sampling
* that add up to `top_p` or higher are kept.
* @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. See https://arxiv.org/pdf/1909.05858.
* @param pad_token_id id of padding token
* @param bos_token_id id of <bos> token
* @param eos_token_id id of <eos> token
* @param bos_token <bos> token string representation
* @param eos_token <eos> token string representation
* @param draft_model draft model for assitive decoding
* @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
* @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
*/
class OPENVINO_GENAI_EXPORTS GenerationConfig {
public:
GenerationConfig() = default;
GenerationConfig(std::string json_path);
explicit GenerationConfig(std::string json_path);

// Generic
size_t max_new_tokens = SIZE_MAX;
@@ -89,6 +95,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
// used for chat scenario
std::string bos_token = "<s>";
std::string eos_token = "</s>";

size_t get_max_new_tokens(size_t prompt_length = 0) const;
bool is_greedy_decoding() const;
bool is_beam_search() const;
bool is_multimomial() const;
static GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {});
};

} // namespace ov
} // namespace genai
} // namespace ov
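
Editor's note: a short sketch of how the parameters documented above fit together (not part of this commit; it assumes the member names used in the README examples earlier in this diff and the helper methods added here).
``` cpp
#include "openvino/genai/generation_config.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <string>

int main(int argc, char* argv[]) {
    std::string model_path = argv[1];
    ov::genai::LLMPipeline pipe(model_path, "CPU");

    // Start from the defaults loaded with the converted model.
    ov::genai::GenerationConfig config = pipe.get_generation_config();

    // Generic parameter: cap the number of newly generated tokens.
    config.max_new_tokens = 100;

    // Group beam search parameters, as in the README examples above.
    config.num_groups = 3;
    config.group_size = 5;
    config.diversity_penalty = 1.5f;

    // Helpers added by this merge: effective token budget and decoding-mode checks.
    std::cout << "token budget: " << config.get_max_new_tokens() << "\n";
    std::cout << "beam search: " << std::boolalpha << config.is_beam_search() << "\n";

    std::cout << pipe("The Sun is yellow because", config);
}
```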
42 changes: 23 additions & 19 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -6,12 +6,13 @@
#include <optional>
#include <variant>

#include <openvino/core/any.hpp>
#include "openvino/core/any.hpp"
#include "openvino/genai/generation_config.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "openvino/genai/streamer_base.hpp"

namespace ov {
namespace genai {

using StreamerVariant = std::variant<std::function<void (std::string)>, std::shared_ptr<StreamerBase>>;
using OptionalGenerationConfig = std::optional<GenerationConfig>;
@@ -71,7 +72,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param device optional device
* @param plugin_config optional plugin_config
*/
LLMPipeline(std::string& path, std::string device="CPU",
LLMPipeline(const std::string& path, const std::string& device="CPU",
const ov::AnyMap& plugin_config={},
const std::string& ov_tokenizers_path="");

Expand All @@ -84,11 +85,10 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param plugin_config optional plugin_config
*/
LLMPipeline(
const std::string model_path,
const ov::Tokenizer& tokenizer,
const std::string device="CPU",
const ov::AnyMap& plugin_config = {},
const std::string& ov_tokenizers_path=""
const std::string& model_path,
const ov::genai::Tokenizer& tokenizer,
const std::string& device="CPU",
const ov::AnyMap& plugin_config = {}
);

~LLMPipeline();
@@ -127,8 +127,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param generation_config optional GenerationConfig
* @return DecodedResults a structure with resulting texts & scores
*/
DecodedResults generate(std::vector<std::string> texts, OptionalGenerationConfig generation_config);
DecodedResults generate(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config);
DecodedResults generate(const std::vector<std::string>& texts, OptionalGenerationConfig generation_config);

/**
* @brief Low level generate to be called with already encoded input_ids tokens.
@@ -153,14 +152,19 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
return generate(text, AnyMap{std::forward<Properties>(properties)...});
}

DecodedResults operator()(std::vector<std::string> text, OptionalGenerationConfig generation_config=std::nullopt);
DecodedResults operator()(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config=std::nullopt);
DecodedResults operator()(const std::vector<std::string>& text, OptionalGenerationConfig generation_config=std::nullopt) {
return generate(text, generation_config);
}

// generate with streamers
std::string operator()(std::string text, OptionalGenerationConfig generation_config=std::nullopt, OptionalStreamerVariant streamer=std::nullopt);
std::string operator()(std::string text, OptionalStreamerVariant streamer);
std::string operator()(
std::string text,
OptionalGenerationConfig generation_config=std::nullopt,
OptionalStreamerVariant streamer=std::nullopt
) {
return generate(text, generation_config, streamer);
}

ov::Tokenizer get_tokenizer();
ov::genai::Tokenizer get_tokenizer();
GenerationConfig get_generation_config() const;
void set_generation_config(const GenerationConfig& generation_config);

@@ -174,10 +178,9 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
};

/*
* utils that allow to use generate and operarator() in the folllowing way:
* utils that allow to use generate and operator() in the following way:
* pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...)
* pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...)
* All names match to names in cofnig except streamer.
*/
static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
static constexpr ov::Property<size_t> max_length{"max_length"};
@@ -207,6 +210,7 @@ static constexpr ov::Property<std::string> eos_token{"eos_token"};

// only lambda streamer can be set via ov::streamer(),... syntaxic sugar,
// because std::variant<StremaerBase, std::function<>> can not be stored in AnyMap
static constexpr ov::Property<std::function<void (std::string)>> streamer_lambda{"streamer"};
static constexpr ov::Property<std::function<void (std::string)>> streamer{"streamer"};

} // namespace ov
} // namespace genai
} // namespace ov
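
Editor's note: the header comment above sketches the property-based calling style; a hedged example of it and of the batched `generate` overload added in this diff follows (not part of the commit). It assumes a `temperature` property is declared in an elided portion of this header, as the comment suggests, and that the properties resolve under `ov::genai::` after the namespace move.
``` cpp
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <optional>
#include <string>
#include <vector>

int main(int argc, char* argv[]) {
    std::string model_path = argv[1];
    ov::genai::LLMPipeline pipe(model_path, "CPU");

    // Property-style call, mirroring the usage sketched in the header comment.
    auto answer = pipe.generate(
        "The Sun is yellow because",
        ov::genai::max_new_tokens(200),
        ov::genai::temperature(1.0f));
    std::cout << answer << "\n";

    // Batched overload added by this merge: several prompts, default config.
    std::vector<std::string> prompts = {"What is OpenVINO?", "What is a tokenizer?"};
    ov::genai::DecodedResults results = pipe.generate(prompts, std::nullopt);
    // `results` carries the generated texts and scores; member names are not
    // shown in this excerpt, so they are left unused here.
    (void)results;
}
```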