-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
LLMs return logits with the probabilities of each token; these probabilities can be converted to tokens/words with different techniques: greedy decoding, beam search decoding, random sampling, etc. This requires writing user-unfriendly post-processing even for the simplest scenario of greedy decoding. In order to make life easier, we combined all decoding scenarios into a single function call, where the decoding method and parameters are specified by arguments. In this PR we provide a user-friendly API for text generation inspired by the `generate` method from the HuggingFace transformers library. - [x] enable calling tokenizers/detokenizers from LLMPipeline - [ ] add callback for streaming mode - done partially, need to improve - [x] rewritten samples with the current approach: [causal_lm/cpp/generate_pipeline/generate_sample.cpp#L73-L83](https://github.com/pavel-esir/openvino.genai/blob/generate_pipeline/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp#L73-L83) - [x] Multibatch greedy decoding - [ ] Speculative decoding - [ ] Grouped Beam Search decoding: ready for batch 1, need to rebase multibatch support after merging openvinotoolkit/openvino.genai#349 - [x] Random sampling Example 1: Greedy search generation ``` LLMPipeline pipe(model_path, device); // Will try to load config from generation_config.json. 
// but if not found default values for greedy search will be used GenerationConfig config = pipe.generation_config(); cout << pipe(prompt, config.max_new_tokens(20)); ``` Example 2: TextStreaming mode ``` LLMPipeline pipe(model_path, device); GenerationConfig config = pipe.generation_config(); auto text_streamer = TextStreamer{pipe}; auto text_streamer_callback = [&text_streamer](std::vector<int64_t>&& tokens, LLMPipeline& pipe){ text_streamer.put(tokens[0]); }; pipe(prompt, config.max_new_tokens(20).set_callback(text_streamer_callback)); text_streamer.end(); ``` CVS-132907 CVS-137920 --------- Co-authored-by: Wovchena <[email protected]> Co-authored-by: Ilya Lavrenov <[email protected]> Co-authored-by: Alexander Suvorov <[email protected]> Co-authored-by: Yaroslav Tarkan <[email protected]> Co-authored-by: Xiake Sun <[email protected]> Co-authored-by: wenyi5608 <[email protected]> Co-authored-by: Ekaterina Aidova <[email protected]> Co-authored-by: guozhong wang <[email protected]> Co-authored-by: Chen Peter <[email protected]>
- Loading branch information
Showing
17 changed files
with
2,752 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
# Copyright (C) 2018-2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# | ||
|
||
# Dependencies

include(FetchContent)

# nlohmann_json is fetched from the v3.11.3 release tarball. URL_HASH makes
# FetchContent verify the download against the given SHA256 digest.
FetchContent_Declare(nlohmann_json
    URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz
    URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406)
# Populates the content and adds it to the build; the library target links
# against nlohmann_json::nlohmann_json further down in this file.
FetchContent_MakeAvailable(nlohmann_json)
|
||
# Fetches Jinja2Cpp at a pinned commit and adds it to the build.
#
# The work is wrapped in a function so that the plain `set(BUILD_SHARED_LIBS OFF)`
# below stays local to the function scope and does not affect the rest of the
# project. The JINJA2CPP_* options are cache entries read by the Jinja2Cpp
# build scripts.
function(ov_genai_build_jinja2cpp)
    FetchContent_Declare(jinja2cpp
        URL https://github.com/jinja2cpp/Jinja2Cpp/archive/9ae7e1fc45d707e1686dd425a154d30963801944.tar.gz
        URL_HASH SHA256=aa41ae425225623ba91be5de3ef1e0d942e682d519311e6235b04b4e7d880e01)

    FetchContent_GetProperties(jinja2cpp)
    if(jinja2cpp_POPULATED)
        # Content was already populated earlier in this configure run;
        # nothing left to do.
        return()
    endif()

    FetchContent_Populate(jinja2cpp)

    # Function-scoped, non-cache variable.
    set(BUILD_SHARED_LIBS OFF)

    # Library flavour.
    set(JINJA2CPP_BUILD_SHARED OFF CACHE BOOL "")
    set(JINJA2CPP_PIC ON CACHE BOOL "")
    set(JINJA2CPP_INSTALL OFF CACHE BOOL "")

    # Language and feature configuration.
    set(JINJA2CPP_CXX_STANDARD 17 CACHE STRING "")
    set(JINJA2CPP_USE_REGEX "std" CACHE STRING "")
    set(JINJA2CPP_WITH_JSON_BINDINGS "none" CACHE STRING "")
    set(JINJA2CPP_STRICT_WARNINGS OFF CACHE BOOL "")

    # EXCLUDE_FROM_ALL: only the Jinja2Cpp targets that something depends on
    # are built by default.
    add_subdirectory("${jinja2cpp_SOURCE_DIR}" "${jinja2cpp_BINARY_DIR}" EXCLUDE_FROM_ALL)
endfunction()

ov_genai_build_jinja2cpp()
|
||
# Library

# Collects every .cpp file located directly in src/ (evaluated at configure time).
file(GLOB SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp")

set(TARGET_NAME openvino_genai)
add_library(${TARGET_NAME} SHARED ${SOURCE_FILES})
# In-tree alias matching the exported name (EXPORT_NAME genai + NAMESPACE openvino::).
add_library(openvino::genai ALIAS ${TARGET_NAME})

# Headers come from include/ when building in-tree and from runtime/include
# (relative to the install prefix) once installed.
target_include_directories(${TARGET_NAME}
    PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" "$<INSTALL_INTERFACE:runtime/include>")

# openvino::runtime is part of the public interface; the JSON and template
# libraries are implementation details.
target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json jinja2cpp)

target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17)

# Extract two last digits from CMAKE_PROJECT_VERSION_MAJOR because SOVERSION can only contain up to 4 symbols.
string(REGEX MATCH [=[[0-9][0-9]$]=] MAJOR_SUFFIX ${CMAKE_PROJECT_VERSION_MAJOR})
# The output directories are wrapped in the always-true "$<1:...>" generator
# expression; per the CMake documentation, multi-configuration generators do
# not append a per-configuration subdirectory when a generator expression is used.
set_target_properties(${TARGET_NAME} PROPERTIES
    EXPORT_NAME genai
    VERSION ${CMAKE_PROJECT_VERSION}
    SOVERSION ${MAJOR_SUFFIX}${CMAKE_PROJECT_VERSION_MINOR}${CMAKE_PROJECT_VERSION_PATCH}
    ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
    LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
    RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
)
|
||
# Also install the shared library into python/openvino_genai/ under a
# per-Python-version install component.
#
# FindPython3 publishes its results with the `Python3_` prefix
# (Python3_VERSION_MAJOR / Python3_VERSION_MINOR). The unprefixed-by-3
# `Python_VERSION_*` variables are only set by the separate FindPython module,
# so using them here would expand to an empty string and yield the component
# name `pygenai__`.
# NOTE(review): any other CMake file or packaging config that refers to this
# component must use the same Python3_* based name.
find_package(Python3 REQUIRED COMPONENTS Interpreter Development)
install(TARGETS ${TARGET_NAME}
    LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}
    RUNTIME DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR})
|
||
# Compute ARCH_DIR, the architecture-specific subdirectory used for the
# runtime/lib and runtime/bin install destinations. Expected layouts:
# - Windows: `<openvino_dir>\runtime\bin\intel64\Release\`
# - MacOS_x86: `<openvino_dir>/runtime/lib/intel64/Release`
# - MacOS_arm64: `<openvino_dir>/runtime/lib/arm64/Release/`
# - Linux_x86: `<openvino_dir>/runtime/lib/intel64/`
# - Linux_arm64: `<openvino_dir>/runtime/lib/aarch64/`
# Default: the lower-cased target processor name, overridden below for known CPUs.
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH_DIR)
# NOTE(review): the first two branches test the HOST processor while the
# default and the last branch use the TARGET processor; when cross-compiling
# these can disagree - confirm this is intended.
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
    set(ARCH_DIR intel64)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)")
    if(APPLE)
        set(ARCH_DIR "arm64")
    else()
        set(ARCH_DIR "aarch64")
    endif()
elseif(ARCH_DIR STREQUAL "x86_64" OR ARCH_DIR STREQUAL "amd64" # Windows detects Intel's 64-bit CPU as AMD64
        OR CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64")
    set(ARCH_DIR intel64)
endif()
# On MSVC and Apple builds the configuration name is appended (e.g. intel64/Release).
# NOTE(review): CMAKE_BUILD_TYPE is normally empty at configure time with
# multi-config generators (Visual Studio, Xcode), which would leave a trailing
# "/" here - verify against the generators actually used.
if(MSVC OR APPLE)
    set(ARCH_DIR ${ARCH_DIR}/${CMAKE_BUILD_TYPE})
endif()
# Install the library itself. Runtime artifacts go to the core_genai
# component; the namelink, import/static archives and headers go to
# core_genai_dev.
install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets
    LIBRARY DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai
        NAMELINK_COMPONENT core_genai_dev
    ARCHIVE DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai_dev
    RUNTIME DESTINATION runtime/bin/${ARCH_DIR} COMPONENT core_genai
    INCLUDES DESTINATION runtime/include)
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION runtime/include COMPONENT core_genai_dev)

# Exported targets for installed-package consumers (openvino::genai).
install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake NAMESPACE openvino:: DESTINATION runtime/cmake)

# Generate the package config and the matching version file, then install both.
# Previously OpenVINOGenAIConfig.cmake was listed twice and the generated
# OpenVINOGenAIConfigVersion.cmake was never installed, so version-constrained
# find_package(OpenVINOGenAI <version>) calls could not be satisfied from an
# installed package.
include(CMakePackageConfigHelpers)
configure_package_config_file(OpenVINOGenAIConfig.cmake.in "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" INSTALL_DESTINATION runtime/cmake)
write_basic_package_version_file("${CMAKE_BINARY_DIR}/OpenVINOGenAIConfigVersion.cmake" VERSION ${CMAKE_PROJECT_VERSION} COMPATIBILITY AnyNewerVersion)
install(FILES "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfigVersion.cmake" DESTINATION runtime/cmake COMPONENT core_genai_dev)

# Also write a targets file into the build tree so the package can be used
# without installing.
export(EXPORT OpenVINOGenAITargets FILE "${CMAKE_BINARY_DIR}/OpenVINOGenAITargets.cmake" NAMESPACE openvino::)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
@PACKAGE_INIT@

# The library links openvino::runtime publicly, so consumers need the
# OpenVINO Runtime package as well.
include(CMakeFindDependencyMacro)
find_dependency(OpenVINO COMPONENTS Runtime)

# Skip the import when a target named openvino_genai already exists
# (presumably when the library is built in the same tree - TODO confirm).
if(NOT TARGET openvino_genai)
    include("${CMAKE_CURRENT_LIST_DIR}/OpenVINOGenAITargets.cmake")
endif()

check_required_components(OpenVINOGenAI)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
// Copyright (C) 2023-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#pragma once | ||
|
||
#include <limits> | ||
#include <variant> | ||
#include <string> | ||
|
||
#include "openvino/runtime/compiled_model.hpp" | ||
#include "openvino/runtime/infer_request.hpp" | ||
#include "openvino/genai/tokenizer.hpp" | ||
|
||
namespace ov { | ||
namespace genai { | ||
|
||
/**
 * @brief Stopping condition for grouped beam search.
 *
 * - EARLY: stop as soon as there are `num_beams` complete candidates.
 * - HEURISTIC: stop when it is unlikely that better candidates will be found.
 * - NEVER: stop only when there cannot be better candidates.
 */
enum class StopCriteria {
    EARLY,
    HEURISTIC,
    NEVER
};
|
||
/** | ||
* @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group | ||
* and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will | ||
* be used while greedy and beam search parameters will not affect decoding at all. | ||
* | ||
* Generic parameters: | ||
* @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt + | ||
* `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. | ||
* @param max_new_tokens the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. | ||
* @param ignore_eos if set to true, then generation will not stop even if <eos> token is met. | ||
* @param eos_token_id token_id of <eos> (end of sentence) | ||
* | ||
* Beam search specific parameters: | ||
* @param num_beams number of beams for beam search. 1 disables beam search. | ||
* @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. | ||
* @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from other group at a | ||
* particular time. See https://arxiv.org/pdf/1909.05858. | ||
* @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to | ||
* the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log | ||
* likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while | ||
* `length_penalty` < 0.0 encourages shorter sequences. | ||
* @param num_return_sequences the number of sequences to return for grouped beam search decoding. | ||
* @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once. | ||
* @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values: | ||
* "EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "HEURISTIC", where an | ||
* "HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; | ||
* "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). | ||
* | ||
* Random sampling parameters: | ||
* @param temperature the value used to modulate token probabilities for random sampling. | ||
* @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. | ||
* @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering. | ||
* @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. | ||
* @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. | ||
*/ | ||
class OPENVINO_GENAI_EXPORTS GenerationConfig { | ||
public: | ||
GenerationConfig() = default; | ||
explicit GenerationConfig(const std::string& json_path); | ||
|
||
// Generic | ||
size_t max_new_tokens = SIZE_MAX; | ||
size_t max_length = SIZE_MAX; | ||
bool ignore_eos = false; | ||
|
||
// Beam search specific | ||
size_t num_beam_groups = 1; | ||
size_t num_beams = 1; | ||
float diversity_penalty = 1.0f; | ||
float length_penalty = 1.0f; | ||
size_t num_return_sequences = 1; | ||
size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max(); | ||
StopCriteria stop_criteria = StopCriteria::HEURISTIC; | ||
|
||
// Multinomial | ||
float temperature = 1.0f; | ||
float top_p = 1.0f; | ||
size_t top_k = 50; | ||
bool do_sample = false; | ||
float repetition_penalty = 1.0f; | ||
|
||
// EOS special token | ||
int64_t eos_token_id = -1; | ||
|
||
size_t get_max_new_tokens(size_t prompt_length = 0) const; | ||
bool is_greedy_decoding() const; | ||
bool is_beam_search() const; | ||
bool is_multinomial() const; | ||
void update_generation_config(const ov::AnyMap& config_map = {}); | ||
|
||
/// @brief checks that are no conflicting parameters, e.g. do_sample=true and num_beams > 1. | ||
/// @throws Exception if config is invalid. | ||
void validate() const; | ||
}; | ||
|
||
/* | ||
* utils that allow to use generate and operator() in the following way: | ||
* pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...) | ||
* pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...) | ||
*/ | ||
static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"}; | ||
static constexpr ov::Property<size_t> max_length{"max_length"}; | ||
static constexpr ov::Property<bool> ignore_eos{"ignore_eos"}; | ||
|
||
static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"}; | ||
static constexpr ov::Property<size_t> num_beams{"num_beams"}; | ||
static constexpr ov::Property<float> diversity_penalty{"diversity_penalty"}; | ||
static constexpr ov::Property<float> length_penalty{"length_penalty"}; | ||
static constexpr ov::Property<size_t> num_return_sequences{"num_return_sequences"}; | ||
static constexpr ov::Property<size_t> no_repeat_ngram_size{"no_repeat_ngram_size"}; | ||
static constexpr ov::Property<StopCriteria> stop_criteria{"stop_criteria"}; | ||
|
||
static constexpr ov::Property<float> temperature{"temperature"}; | ||
static constexpr ov::Property<float> top_p{"top_p"}; | ||
static constexpr ov::Property<int> top_k{"top_k"}; | ||
static constexpr ov::Property<bool> do_sample{"do_sample"}; | ||
static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"}; | ||
static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"}; | ||
|
||
} // namespace genai | ||
} // namespace ov |
Oops, something went wrong.