Skip to content

Commit

Permalink
Generate pipeline (#334)
Browse files Browse the repository at this point in the history
LLMs return logits with probabilities of each token; these probabilities
can be converted to tokens/words with different techniques: greedy
decoding, beam search decoding, random sampling, etc.

This requires writing user-unfriendly post-processing even for the
simplest scenario of greedy decoding. In order to make life easier, we
combined all decoding scenarios into a single function call, where the
decoding method and parameters are specified by arguments.

In this PR we provide a user friendly API for text generation inspired
by `generate` method from HuggingFace transformers library.

- [x] enable calling tokenizers/detokenizers from LLMPipeline
- [ ] add callback for streaming mode - done partially, need to improve
- [x] rewritten samples with the current approach:
[causal_lm/cpp/generate_pipeline/generate_sample.cpp#L73-L83](https://github.com/pavel-esir/openvino.genai/blob/generate_pipeline/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp#L73-L83)
- [x] Multibatch greedy decoding
- [ ] Speculative decoding
- [ ] Grouped Beam Search decoding: ready for batch 1, need to rebase
multibatch support after merging
openvinotoolkit/openvino.genai#349
- [x] Random sampling

Example 1: Greedy search generation
```
LLMPipeline pipe(model_path, device);

// Will try to load config from generation_config.json,
// but if not found, default values for greedy search will be used
GenerationConfig config = pipe.generation_config();

cout << pipe(prompt, config.max_new_tokens(20));
```

Example 2: TextStreaming mode
```
LLMPipeline pipe(model_path, device);

GenerationConfig config = pipe.generation_config();

auto text_streamer = TextStreamer{pipe};
auto text_streamer_callback = [&text_streamer](std::vector<int64_t>&& tokens, LLMPipeline& pipe){
    text_streamer.put(tokens[0]);
};

pipe(prompt, config.max_new_tokens(20).set_callback(text_streamer_callback));
text_streamer.end();
```

CVS-132907 CVS-137920

---------

Co-authored-by: Wovchena <[email protected]>
Co-authored-by: Ilya Lavrenov <[email protected]>
Co-authored-by: Alexander Suvorov <[email protected]>
Co-authored-by: Yaroslav Tarkan <[email protected]>
Co-authored-by: Xiake Sun <[email protected]>
Co-authored-by: wenyi5608 <[email protected]>
Co-authored-by: Ekaterina Aidova <[email protected]>
Co-authored-by: guozhong wang <[email protected]>
Co-authored-by: Chen Peter <[email protected]>
  • Loading branch information
10 people authored Jun 7, 2024
0 parents commit e852acf
Show file tree
Hide file tree
Showing 17 changed files with 2,752 additions and 0 deletions.
103 changes: 103 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Copyright (C) 2018-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

# Dependencies

include(FetchContent)

# nlohmann/json is pinned to a release tarball; the SHA256 hash guards
# against a tampered or partially-downloaded archive.
FetchContent_Declare(nlohmann_json
URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz
URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406)
FetchContent_MakeAvailable(nlohmann_json)

# Fetches and configures Jinja2Cpp (used for chat templates) as a static,
# position-independent library. Wrapped in a function so the set() calls
# below stay scoped and do not leak into the rest of the project.
function(ov_genai_build_jinja2cpp)
# Pinned to a specific commit tarball; hash guards download integrity.
FetchContent_Declare(jinja2cpp
URL https://github.com/jinja2cpp/Jinja2Cpp/archive/9ae7e1fc45d707e1686dd425a154d30963801944.tar.gz
URL_HASH SHA256=aa41ae425225623ba91be5de3ef1e0d942e682d519311e6235b04b4e7d880e01)

FetchContent_GetProperties(jinja2cpp)
if(NOT jinja2cpp_POPULATED)
FetchContent_Populate(jinja2cpp)

# All options must be set BEFORE add_subdirectory() so Jinja2Cpp's own
# CMakeLists picks them up during its configure step.
set(BUILD_SHARED_LIBS OFF)
set(JINJA2CPP_INSTALL OFF CACHE BOOL "")
set(JINJA2CPP_CXX_STANDARD 17 CACHE STRING "")
set(JINJA2CPP_BUILD_SHARED OFF CACHE BOOL "")
set(JINJA2CPP_USE_REGEX "std" CACHE STRING "")
set(JINJA2CPP_WITH_JSON_BINDINGS "none" CACHE STRING "")
set(JINJA2CPP_STRICT_WARNINGS OFF CACHE BOOL "")
# PIC is required because the static lib is linked into the shared
# openvino_genai library below.
set(JINJA2CPP_PIC ON CACHE BOOL "")

# EXCLUDE_FROM_ALL: build Jinja2Cpp targets only on demand and keep them
# out of this project's install set.
add_subdirectory("${jinja2cpp_SOURCE_DIR}" "${jinja2cpp_BINARY_DIR}" EXCLUDE_FROM_ALL)
endif()
endfunction()

ov_genai_build_jinja2cpp()

# Library

# NOTE(review): file(GLOB) does not re-run when sources are added; CMake's
# documentation recommends listing sources explicitly. Kept as-is to avoid
# changing build behavior here.
file(GLOB SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp")

set(TARGET_NAME openvino_genai)
add_library(${TARGET_NAME} SHARED ${SOURCE_FILES})
add_library(openvino::genai ALIAS ${TARGET_NAME})

# Headers come from include/ in the source tree at build time and from
# runtime/include once installed.
target_include_directories(${TARGET_NAME}
    PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" "$<INSTALL_INTERFACE:runtime/include>")

# json and jinja2cpp are implementation details (PRIVATE); only the
# OpenVINO runtime appears in the public link interface.
target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json jinja2cpp)

target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17)

# Extract two last digits from CMAKE_PROJECT_VERSION_MAJOR because SOVERSION can only contain up to 4 symbols.
string(REGEX MATCH [=[[0-9][0-9]$]=] MAJOR_SUFFIX ${CMAKE_PROJECT_VERSION_MAJOR})
set_target_properties(${TARGET_NAME} PROPERTIES
    EXPORT_NAME genai
    VERSION ${CMAKE_PROJECT_VERSION}
    SOVERSION ${MAJOR_SUFFIX}${CMAKE_PROJECT_VERSION_MINOR}${CMAKE_PROJECT_VERSION_PATCH}
    ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
    LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
    RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>"
)

# BUGFIX: find_package(Python3) defines Python3_VERSION_MAJOR/MINOR; the
# original interpolated ${Python_VERSION_MAJOR}/${Python_VERSION_MINOR}
# (set only by find_package(Python)), which expanded to empty strings and
# produced the malformed component name "pygenai__".
find_package(Python3 REQUIRED COMPONENTS Interpreter Development)
install(TARGETS ${TARGET_NAME}
    LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}
    RUNTIME DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR})

# Compute ARCH_DIR, the per-platform library subdirectory used by OpenVINO
# install layouts:
# - Windows: `<openvino_dir>\runtime\bin\intel64\Release\`
# - MacOS_x86: `<openvino_dir>/runtime/lib/intel64/Release`
# - MacOS_arm64: `<openvino_dir>/runtime/lib/arm64/Release/`
# - Linux_x86: `<openvino_dir>/runtime/lib/intel64/`
# - Linux_arm64: `<openvino_dir>/runtime/lib/aarch64/`
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH_DIR)
# NOTE(review): the first two branches test the HOST processor while
# ARCH_DIR above is derived from the TARGET processor; when
# cross-compiling these can disagree — confirm intended behavior.
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
set(ARCH_DIR intel64)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)")
if(APPLE)
set(ARCH_DIR "arm64")
else()
set(ARCH_DIR "aarch64")
endif()
# Fallback on the target processor / macOS universal-build setting when the
# host checks above did not match.
elseif(ARCH_DIR STREQUAL "x86_64" OR ARCH_DIR STREQUAL "amd64" # Windows detects Intel's 64-bit CPU as AMD64
OR CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64")
set(ARCH_DIR intel64)
endif()
# MSVC and Xcode are multi-config generators; OpenVINO nests the build type
# under the arch directory on those platforms.
if(MSVC OR APPLE)
set(ARCH_DIR ${ARCH_DIR}/${CMAKE_BUILD_TYPE})
endif()
# Install the library plus CMake package files so consumers can use
# `find_package(OpenVINOGenAI)` against the installed runtime tree.
install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets
    LIBRARY DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai
    NAMELINK_COMPONENT core_genai_dev
    ARCHIVE DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai_dev
    RUNTIME DESTINATION runtime/bin/${ARCH_DIR} COMPONENT core_genai
    INCLUDES DESTINATION runtime/include)
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION runtime/include COMPONENT core_genai_dev)
install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake NAMESPACE openvino:: DESTINATION runtime/cmake)
# Included once (the original included CMakePackageConfigHelpers twice).
include(CMakePackageConfigHelpers)
configure_package_config_file(OpenVINOGenAIConfig.cmake.in "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" INSTALL_DESTINATION runtime/cmake)
write_basic_package_version_file("${CMAKE_BINARY_DIR}/OpenVINOGenAIConfigVersion.cmake" VERSION ${CMAKE_PROJECT_VERSION} COMPATIBILITY AnyNewerVersion)
# BUGFIX: the original install(FILES ...) listed OpenVINOGenAIConfig.cmake
# twice and never installed the generated version file, so downstream
# find_package(OpenVINOGenAI <version>) could not check compatibility.
install(FILES "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfigVersion.cmake" DESTINATION runtime/cmake COMPONENT core_genai_dev)
export(EXPORT OpenVINOGenAITargets FILE "${CMAKE_BINARY_DIR}/OpenVINOGenAITargets.cmake" NAMESPACE openvino::)
10 changes: 10 additions & 0 deletions OpenVINOGenAIConfig.cmake.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
@PACKAGE_INIT@

# OpenVINO GenAI links publicly against the OpenVINO Runtime, so consumers
# must be able to resolve it too.
include(CMakeFindDependencyMacro)
find_dependency(OpenVINO COMPONENTS Runtime)

# Import the exported targets only once per CMake run.
if(NOT TARGET openvino_genai)
include("${CMAKE_CURRENT_LIST_DIR}/OpenVINOGenAITargets.cmake")
endif()

check_required_components(OpenVINOGenAI)
125 changes: 125 additions & 0 deletions include/openvino/genai/generation_config.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <limits>
#include <variant>
#include <string>

#include "openvino/runtime/compiled_model.hpp"
#include "openvino/runtime/infer_request.hpp"
#include "openvino/genai/tokenizer.hpp"

namespace ov {
namespace genai {

/**
 * @brief Controls the stopping condition for grouped beam search. The following values are possible:
 *        "EARLY" stops as soon as there are `num_beams` complete candidates.
 *        "HEURISTIC" stops when it is unlikely to find better candidates.
 *        "NEVER" stops only when there cannot be better candidates (canonical beam search).
 */
enum class StopCriteria { EARLY, HEURISTIC, NEVER };

/**
 * @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
 * and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
 * be used while greedy and beam search parameters will not affect decoding at all.
 *
 * Generic parameters:
 * @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
 *        `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set.
 * @param max_new_tokens the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
 * @param ignore_eos if set to true, then generation will not stop even if <eos> token is met.
 * @param eos_token_id token_id of <eos> (end of sentence)
 *
 * Beam search specific parameters:
 * @param num_beams number of beams for beam search. 1 disables beam search.
 * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
 * @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from another group at a
 *        particular time. See https://arxiv.org/pdf/1909.05858.
 * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
 *        the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
 *        likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
 *        `length_penalty` < 0.0 encourages shorter sequences.
 * @param num_return_sequences the number of sequences to return for grouped beam search decoding.
 * @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once.
 * @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values:
 *        "EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "HEURISTIC", where a
 *        heuristic is applied and the generation stops when it is very unlikely to find better candidates;
 *        "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
 *
 * Random sampling parameters:
 * @param temperature the value used to modulate token probabilities for random sampling.
 * @param top_p if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
 * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
 * @param do_sample whether or not to use multinomial random sampling instead of greedy/beam decoding.
 * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
 */
class OPENVINO_GENAI_EXPORTS GenerationConfig {
public:
GenerationConfig() = default;
// Loads the config from a JSON file at `json_path`.
explicit GenerationConfig(const std::string& json_path);

// Generic
size_t max_new_tokens = SIZE_MAX;
size_t max_length = SIZE_MAX;
bool ignore_eos = false;

// Beam search specific
size_t num_beam_groups = 1;
size_t num_beams = 1;
float diversity_penalty = 1.0f;
float length_penalty = 1.0f;
size_t num_return_sequences = 1;
size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
StopCriteria stop_criteria = StopCriteria::HEURISTIC;

// Multinomial
float temperature = 1.0f;
float top_p = 1.0f;
size_t top_k = 50;
bool do_sample = false;
float repetition_penalty = 1.0f;

// EOS special token; -1 means "not set"
int64_t eos_token_id = -1;

// Resolves max_new_tokens vs. max_length given the prompt length.
size_t get_max_new_tokens(size_t prompt_length = 0) const;
bool is_greedy_decoding() const;
bool is_beam_search() const;
bool is_multinomial() const;
// Overwrites fields present in `config_map`; absent keys keep their values.
void update_generation_config(const ov::AnyMap& config_map = {});

/// @brief checks that there are no conflicting parameters, e.g. do_sample=true and num_beams > 1.
/// @throws Exception if config is invalid.
void validate() const;
};

/*
 * Properties that allow using generate() and operator() in the following way:
 * pipe.generate(input_ids, ov::genai::max_new_tokens(200), ov::genai::temperature(1.0f), ...)
 * pipe(text, ov::genai::max_new_tokens(200), ov::genai::temperature(1.0f), ...)
 *
 * Each property mirrors the GenerationConfig field of the same name and type,
 * so values round-trip through ov::AnyMap without narrowing conversions.
 */
static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
static constexpr ov::Property<size_t> max_length{"max_length"};
static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};

static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
static constexpr ov::Property<size_t> num_beams{"num_beams"};
static constexpr ov::Property<float> diversity_penalty{"diversity_penalty"};
static constexpr ov::Property<float> length_penalty{"length_penalty"};
static constexpr ov::Property<size_t> num_return_sequences{"num_return_sequences"};
static constexpr ov::Property<size_t> no_repeat_ngram_size{"no_repeat_ngram_size"};
static constexpr ov::Property<StopCriteria> stop_criteria{"stop_criteria"};

static constexpr ov::Property<float> temperature{"temperature"};
static constexpr ov::Property<float> top_p{"top_p"};
// CONSISTENCY FIX: GenerationConfig::top_k is size_t; the property was
// declared as Property<int>, unlike every other property which matches its
// field's type. Property<size_t> keeps extraction via Any::as<size_t>() exact.
static constexpr ov::Property<size_t> top_k{"top_k"};
static constexpr ov::Property<bool> do_sample{"do_sample"};
static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"};
static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"};
Loading

0 comments on commit e852acf

Please sign in to comment.