Generate pipeline (#334)

LLMs return logits with probabilities for each token; these probabilities can be converted to tokens/words with different techniques: greedy decoding, beam search decoding, random sampling, etc. This requires writing user-unfriendly post-processing even for the simplest scenario of greedy decoding. In order to make life easier, we combined all decoding scenarios into a single function call, where the decoding method and parameters are specified by arguments. In this PR we provide a user-friendly API for text generation inspired by the `generate` method from the HuggingFace transformers library. - [x] enable calling tokenizers/detokenizers from LLMPipeline - [ ] add callback for streaming mode - done partially, need to improve - [x] rewritten samples with the current approach: [causal_lm/cpp/generate_pipeline/generate_sample.cpp#L73-L83](https://github.com/pavel-esir/openvino.genai/blob/generate_pipeline/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp#L73-L83) - [x] Multibatch greedy decoding - [ ] Speculative decoding - [ ] Grouped Beam Search decoding: ready for batch 1, need to rebase multibatch support after merging openvinotoolkit/openvino.genai#349 - [x] Random sampling Example 1: Greedy search generation ``` LLMPipeline pipe(model_path, device); // Will try to load config from generation_config.json. 
// but if not found, default values for greedy search will be used GenerationConfig config = pipe.generation_config(); cout << pipe(prompt, config.max_new_tokens(20)); ``` Example 2: TextStreaming mode ``` LLMPipeline pipe(model_path, device); GenerationConfig config = pipe.generation_config(); auto text_streamer = TextStreamer{pipe}; auto text_streamer_callback = [&text_streamer](std::vector<int64_t>&& tokens, LLMPipeline& pipe){ text_streamer.put(tokens[0]); }; pipe(prompt, config.max_new_tokens(20).set_callback(text_streamer_callback)); text_streamer.end(); ``` CVS-132907 CVS-137920 --------- Co-authored-by: Wovchena <[email protected]> Co-authored-by: Ilya Lavrenov <[email protected]> Co-authored-by: Alexander Suvorov <[email protected]> Co-authored-by: Yaroslav Tarkan <[email protected]> Co-authored-by: Xiake Sun <[email protected]> Co-authored-by: wenyi5608 <[email protected]> Co-authored-by: Ekaterina Aidova <[email protected]> Co-authored-by: guozhong wang <[email protected]> Co-authored-by: Chen Peter <[email protected]>
ScottZhang812 · Jun 7, 2024 · e852acf · e852acf
commit e852acf
Show file tree

Hide file tree

Showing 17 changed files with 2,752 additions and 0 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,103 @@
+# Copyright (C) 2018-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+# Dependencies
+
+# nlohmann_json is downloaded at configure time from a pinned release archive.
+# URL_HASH makes the download fail if the archive content does not match.
+include(FetchContent)
+
+FetchContent_Declare(nlohmann_json
+    URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz
+    URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406)
+FetchContent_MakeAvailable(nlohmann_json)
+
+# Downloads Jinja2Cpp (pinned commit archive, verified by SHA256) and adds it to this build.
+# Because this is a function, the non-cache set(BUILD_SHARED_LIBS OFF) below is local to
+# the call and does not alter BUILD_SHARED_LIBS for the rest of the project.
+function(ov_genai_build_jinja2cpp)
+    FetchContent_Declare(jinja2cpp
+        URL https://github.com/jinja2cpp/Jinja2Cpp/archive/9ae7e1fc45d707e1686dd425a154d30963801944.tar.gz
+        URL_HASH SHA256=aa41ae425225623ba91be5de3ef1e0d942e682d519311e6235b04b4e7d880e01)
+
+    # Populate manually (instead of FetchContent_MakeAvailable) so that the options below
+    # are set before add_subdirectory() and EXCLUDE_FROM_ALL can be passed.
+    FetchContent_GetProperties(jinja2cpp)
+    if(NOT jinja2cpp_POPULATED)
+        FetchContent_Populate(jinja2cpp)
+
+        # Jinja2Cpp build options: non-shared library, no install rules, C++17, std regex,
+        # no JSON bindings, relaxed warnings; JINJA2CPP_PIC is presumably position-independent
+        # code, which a static library linked into the shared openvino_genai library needs.
+        set(BUILD_SHARED_LIBS OFF)
+        set(JINJA2CPP_INSTALL OFF CACHE BOOL "")
+        set(JINJA2CPP_CXX_STANDARD 17 CACHE STRING "")
+        set(JINJA2CPP_BUILD_SHARED OFF CACHE BOOL "")
+        set(JINJA2CPP_USE_REGEX "std" CACHE STRING "")
+        set(JINJA2CPP_WITH_JSON_BINDINGS "none" CACHE STRING "")
+        set(JINJA2CPP_STRICT_WARNINGS OFF CACHE BOOL "")
+        set(JINJA2CPP_PIC ON CACHE BOOL "")
+
+        # EXCLUDE_FROM_ALL: Jinja2Cpp targets are built only when another target depends on them.
+        add_subdirectory("${jinja2cpp_SOURCE_DIR}" "${jinja2cpp_BINARY_DIR}" EXCLUDE_FROM_ALL)
+    endif()
+endfunction()
+
+ov_genai_build_jinja2cpp()
+
+# Library
+
+# All .cpp files directly under src/ are compiled into the library (the glob is not recursive).
+file(GLOB SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp")
+
+# Shared library target; openvino::genai is an in-tree alias matching the exported name.
+set(TARGET_NAME openvino_genai)
+add_library(${TARGET_NAME} SHARED ${SOURCE_FILES})
+add_library(openvino::genai ALIAS ${TARGET_NAME})
+
+# Headers come from include/ in the build tree and from runtime/include once installed.
+target_include_directories(${TARGET_NAME}
+    PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" "$<INSTALL_INTERFACE:runtime/include>")
+
+# OpenVINO Runtime is part of the public interface; JSON and Jinja2Cpp are implementation details.
+target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json jinja2cpp)
+
+target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17)
+
+# Extract two last digits from CMAKE_PROJECT_VERSION_MAJOR because SOVERSION can only contain up to 4 symbols.
+string(REGEX MATCH [=[[0-9][0-9]$]=] MAJOR_SUFFIX ${CMAKE_PROJECT_VERSION_MAJOR})
+# The $<1:...> generator expression keeps multi-config generators from appending a
+# per-configuration subdirectory to the output directories.
+set_target_properties(${TARGET_NAME} PROPERTIES
+    EXPORT_NAME genai
+    VERSION ${CMAKE_PROJECT_VERSION}
+    SOVERSION ${MAJOR_SUFFIX}${CMAKE_PROJECT_VERSION_MINOR}${CMAKE_PROJECT_VERSION_PATCH}
+    ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
+    LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
+    RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
+)
+
+# Install the library next to the Python package, in a component named after the Python version.
+# FindPython3 publishes its results as Python3_* variables; the un-suffixed Python_* names are
+# set only by FindPython, so using them here would collapse the component name to "pygenai__"
+# unless find_package(Python) happened to run in an enclosing scope.
+find_package(Python3 REQUIRED COMPONENTS Interpreter Development)
+install(TARGETS ${TARGET_NAME}
+    LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}
+    RUNTIME DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR})
+
+# Compute ARCH_DIR, the architecture subdirectory used by the install destinations below.
+# The listed paths are presumably the OpenVINO runtime library locations this is meant to
+# mirror (TODO confirm against the OpenVINO package layout):
+# - Windows: `<openvino_dir>\runtime\bin\intel64\Release\`
+# - MacOS_x86: `<openvino_dir>/runtime/lib/intel64/Release`
+# - MacOS_arm64: `<openvino_dir>/runtime/lib/arm64/Release/`
+# - Linux_x86: `<openvino_dir>/runtime/lib/intel64/`
+# - Linux_arm64: `<openvino_dir>/runtime/lib/aarch64/`
+# Default: the lowercased target processor name; the branches below normalise known names.
+# NOTE(review): the first two branches test CMAKE_HOST_SYSTEM_PROCESSOR while the default and
+# the last branch use CMAKE_SYSTEM_PROCESSOR; these differ when cross-compiling - confirm intent.
+string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH_DIR)
+if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
+    set(ARCH_DIR intel64)
+elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)")
+    if(APPLE)
+        set(ARCH_DIR "arm64")
+    else()
+        set(ARCH_DIR "aarch64")
+    endif()
+elseif(ARCH_DIR STREQUAL "x86_64" OR ARCH_DIR STREQUAL "amd64"  # Windows detects Intel's 64-bit CPU as AMD64
+        OR CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64")
+    set(ARCH_DIR intel64)
+endif()
+# MSVC and Apple builds get an extra per-build-type subdirectory.
+# NOTE(review): CMAKE_BUILD_TYPE is normally empty with multi-config generators, which would
+# leave a trailing "/" here - confirm how this is configured in practice.
+if(MSVC OR APPLE)
+    set(ARCH_DIR ${ARCH_DIR}/${CMAKE_BUILD_TYPE})
+endif()
+# Install the library and register it in the OpenVINOGenAITargets export set.
+# Runtime files go to the core_genai component; development files (namelink, import/static
+# archives, headers, CMake package files) go to core_genai_dev.
+install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets
+    LIBRARY DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai
+        NAMELINK_COMPONENT core_genai_dev
+    ARCHIVE DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai_dev
+    RUNTIME DESTINATION runtime/bin/${ARCH_DIR} COMPONENT core_genai
+    INCLUDES DESTINATION runtime/include)
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION runtime/include COMPONENT core_genai_dev)
+install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake NAMESPACE openvino:: DESTINATION runtime/cmake)
+
+# Generate the package config and its version file in the build tree.
+include(CMakePackageConfigHelpers)
+configure_package_config_file(OpenVINOGenAIConfig.cmake.in "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" INSTALL_DESTINATION runtime/cmake)
+write_basic_package_version_file("${CMAKE_BINARY_DIR}/OpenVINOGenAIConfigVersion.cmake" VERSION ${CMAKE_PROJECT_VERSION} COMPATIBILITY AnyNewerVersion)
+# Install both generated files: without OpenVINOGenAIConfigVersion.cmake next to the config,
+# find_package(OpenVINOGenAI <version>) cannot perform its version check on an installed package.
+install(FILES "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfigVersion.cmake" DESTINATION runtime/cmake COMPONENT core_genai_dev)
+# Also write the targets file into the build tree so the package can be consumed without installing.
+export(EXPORT OpenVINOGenAITargets FILE "${CMAKE_BINARY_DIR}/OpenVINOGenAITargets.cmake" NAMESPACE openvino::)
diff --git a/OpenVINOGenAIConfig.cmake.in b/OpenVINOGenAIConfig.cmake.in
@@ -0,0 +1,10 @@
+@PACKAGE_INIT@
+
+# Package configuration template for find_package(OpenVINOGenAI).
+# The exported library links openvino::runtime publicly, so OpenVINO Runtime has to be
+# located before the exported targets are loaded.
+include(CMakeFindDependencyMacro)
+find_dependency(OpenVINO COMPONENTS Runtime)
+
+# Skip the exported targets when a target named openvino_genai already exists in the
+# consuming project (presumably the in-tree build case - confirm).
+if(NOT TARGET openvino_genai)
+    include("${CMAKE_CURRENT_LIST_DIR}/OpenVINOGenAITargets.cmake")
+endif()
+
+# Macro provided by the @PACKAGE_INIT@ expansion above.
+check_required_components(OpenVINOGenAI)
diff --git a/include/openvino/genai/generation_config.hpp b/include/openvino/genai/generation_config.hpp
@@ -0,0 +1,125 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <limits>
+#include <variant>
+#include <string>
+
+#include "openvino/runtime/compiled_model.hpp"
+#include "openvino/runtime/infer_request.hpp"
+#include "openvino/genai/tokenizer.hpp"
+
+namespace ov {
+namespace genai {
+
+/**
+ * @brief Controls the stopping condition for grouped beam search. The following values are possible:
+ *        "EARLY" stops as soon as there are `num_beams` complete candidates.
+ *        "HEURISTIC" stops when it is unlikely to find better candidates.
+ *        "NEVER" stops when there cannot be better candidates.
+ */
+enum class StopCriteria { EARLY, HEURISTIC, NEVER };
+
+/**
+ * @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
+ * and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
+ * be used while greedy and beam search parameters will not affect decoding at all.
+ *
+ * Generic parameters:
+ * @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
+ *        `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set.
+ * @param max_new_tokens the maximum number of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
+ * @param ignore_eos if set to true, then generation will not stop even if <eos> token is met.
+ * @param eos_token_id token_id of <eos> (end of sequence)
+ *
+ * Beam search specific parameters:
+ * @param num_beams number of beams for beam search. 1 disables beam search.
+ * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
+ * @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from other group at a
+ *        particular time. See https://arxiv.org/pdf/1610.02424.
+ * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+ *        the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+ *        likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+ *        `length_penalty` < 0.0 encourages shorter sequences.
+ * @param num_return_sequences the number of sequences to return for grouped beam search decoding.
+ * @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once.
+ * @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values:
+ *        "EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "HEURISTIC", where a
+ *        heuristic is applied and the generation stops when it is very unlikely to find better candidates;
+ *        "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
+ *
+ * Random sampling parameters:
+ * @param temperature the value used to modulate token probabilities for random sampling.
+ * @param top_p if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+ * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
+ * @param do_sample whether or not to use multinomial random sampling.
+ * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. See https://arxiv.org/pdf/1909.05858.
+ */
+class OPENVINO_GENAI_EXPORTS GenerationConfig {
+public:
+    GenerationConfig() = default;
+    /// @brief Constructs the config from the JSON file at `json_path`.
+    /// NOTE(review): the accepted keys are defined by the implementation, which is not visible in this header.
+    explicit GenerationConfig(const std::string& json_path);
+
+    // Generic
+    // The maximum size_t value acts as the "not set" default for both limits. std::numeric_limits
+    // is used (as for no_repeat_ngram_size below) because this header includes <limits>, while
+    // the SIZE_MAX macro would require <cstdint>.
+    size_t max_new_tokens = std::numeric_limits<size_t>::max();
+    size_t max_length = std::numeric_limits<size_t>::max();
+    bool ignore_eos = false;
+
+    // Beam search specific
+    size_t num_beam_groups = 1;
+    size_t num_beams = 1;
+    float diversity_penalty = 1.0f;
+    float length_penalty = 1.0f;
+    size_t num_return_sequences = 1;
+    size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
+    StopCriteria stop_criteria = StopCriteria::HEURISTIC;
+
+    // Multinomial
+    float temperature = 1.0f;
+    float top_p = 1.0f;
+    size_t top_k = 50;
+    bool do_sample = false;
+    float repetition_penalty = 1.0f;
+
+    // EOS special token
+    int64_t eos_token_id = -1;
+
+    /// @brief Returns the number of new tokens to generate for a prompt of `prompt_length` tokens.
+    /// NOTE(review): presumably resolves `max_new_tokens` against `max_length - prompt_length`; confirm in the implementation.
+    size_t get_max_new_tokens(size_t prompt_length = 0) const;
+    // Decoding-mode predicates; the exact conditions live in the implementation.
+    bool is_greedy_decoding() const;
+    bool is_beam_search() const;
+    bool is_multinomial() const;
+    /// @brief Updates fields from `config_map`.
+    /// NOTE(review): keys presumably match the field names / ov::Property names declared in this header; confirm in the implementation.
+    void update_generation_config(const ov::AnyMap& config_map = {});
+
+    /// @brief checks that there are no conflicting parameters, e.g. do_sample=true and num_beams > 1.
+    /// @throws Exception if config is invalid.
+    void validate() const;
+};
+
+/*
+ * utils that allow to use generate and operator() in the following way:
+ * pipe.generate(input_ids, ov::genai::max_new_tokens(200), ov::genai::temperature(1.0f),...)
+ * pipe(text, ov::genai::max_new_tokens(200), ov::genai::temperature(1.0f),...)
+ *
+ * Each property's value type matches the type of the GenerationConfig field with the same name.
+*/
+static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
+static constexpr ov::Property<size_t> max_length{"max_length"};
+static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};
+
+static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
+static constexpr ov::Property<size_t> num_beams{"num_beams"};
+static constexpr ov::Property<float> diversity_penalty{"diversity_penalty"};
+static constexpr ov::Property<float> length_penalty{"length_penalty"};
+static constexpr ov::Property<size_t> num_return_sequences{"num_return_sequences"};
+static constexpr ov::Property<size_t> no_repeat_ngram_size{"no_repeat_ngram_size"};
+static constexpr ov::Property<StopCriteria> stop_criteria{"stop_criteria"};
+
+static constexpr ov::Property<float> temperature{"temperature"};
+static constexpr ov::Property<float> top_p{"top_p"};
+// size_t (not int) so the stored value has the same type as GenerationConfig::top_k.
+static constexpr ov::Property<size_t> top_k{"top_k"};
+static constexpr ov::Property<bool> do_sample{"do_sample"};
+static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"};
+static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"};
+
+}  // namespace genai
+}  // namespace ov