Commit

Merge branch 'generate_pipeline' into fix-abi
Wovchena committed May 27, 2024
2 parents 78666da + bbc8c25 commit 140b59c
Showing 23 changed files with 353 additions and 353 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/genai_package.yml
@@ -18,7 +18,7 @@ jobs:
- run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh
- run: sudo apt-get install libtbb-dev
- run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/
- run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package
- run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j
- run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov
- run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace
if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build
@@ -49,7 +49,7 @@ jobs:
- run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.2.0-15349-765302e0de1/w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64.zip
- run: unzip ov.zip
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install"
if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build
9 changes: 8 additions & 1 deletion .github/workflows/genai_python_lib.yml
@@ -22,7 +22,7 @@ jobs:
# build system doesn't. Install ./requirements-build.txt to detect possible conflicts.
- run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
- run: PYTHONPATH=./src/python/ python -c "from openvino_genai import LLMPipeline"
- run: source ./ov/setupvars.sh && CMAKE_BUILD_PARALLEL_LEVEL="" python -m pip install .
- run: source ./ov/setupvars.sh && CMAKE_BUILD_PARALLEL_LEVEL="" python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
- run: python -c "from openvino_genai import LLMPipeline"
- name: GenAI Python API tests
run: |
@@ -51,9 +51,16 @@ jobs:
- run: unzip ov.zip
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j
<<<<<<< HEAD
# GitHub Actions already provides what is listed in ./requirements-build.txt but the internal
# build system doesn't. Install ./requirements-build.txt to detect possible conflicts.
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
=======
- run: python -m pip install "numpy<1.27"
# GitHub Actions already provides what is listed in ./requirements-build.txt but the internal
# build system doesn't. Install ./requirements-build.txt to detect possible conflicts.
- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt
>>>>>>> generate_pipeline
- run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -c "from openvino_genai import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
- run: set CMAKE_BUILD_PARALLEL_LEVEL=&& call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install .
- run: python -c "from openvino_genai import LLMPipeline"
70 changes: 24 additions & 46 deletions src/README.md
@@ -8,7 +8,7 @@ optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weigh
pip install openvino-genai
```

LLMPipeline is the main object used for decoding. You can initiliza it straigh away from the folder with the converted model. It will automanically load the main model, tokenizer, detokenizer and default generation configuration.
`LLMPipeline` is the main object used for decoding. You can construct it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and default generation configuration.

### Python

@@ -24,8 +24,8 @@ Calling generate with custom generation config parameters, e.g. config for group
import openvino_genai as ov_genai
pipe = ov_genai.LLMPipeline(model_path, "CPU")

res = pipe.generate("The Sun is yellow bacause", max_new_tokens=30, num_groups=3, group_size=5)
print(res)
result = pipe.generate("The Sun is yellow bacause", max_new_tokens=30, num_groups=3, group_size=5, diversity_penalty=1.5)
print(result)
```

output:
@@ -38,7 +38,7 @@ A simples chat in python:
import openvino_genai as ov_genai
pipe = ov_genai.LLMPipeline(model_path)

config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.1}
config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.5}
pipe.set_generation_config(config)

pipe.start_chat()
@@ -49,7 +49,6 @@ while True:
        break
    print(pipe(prompt))
pipe.finish_chat()

```

Test to compare with Huggingface outputs
@@ -63,7 +62,7 @@ Minimalistc example

int main(int argc, char* argv[]) {
std::string model_path = argv[1];
ov::LLMPipeline pipe(model_path, "CPU");
ov::genai::LLMPipeline pipe(model_path, "CPU");
std::cout << pipe.generate("The Sun is yellow bacause");
}
```
@@ -75,9 +74,9 @@ Using Group Beam Search Decoding
int main(int argc, char* argv[]) {
std::string model_path = argv[1];
ov::LLMPipeline pipe(model_path, "CPU");
ov::genai::LLMPipeline pipe(model_path, "CPU");
ov::GenerationConfig config = pipe.get_generation_config();
ov::genai::GenerationConfig config = pipe.get_generation_config();
config.max_new_tokens = 256;
config.num_groups = 3;
config.group_size = 5;
@@ -87,7 +86,7 @@ int main(int argc, char* argv[]) {
}
```

A simplest chat in C++
A simple chat in C++ using grouped beam search decoding
``` cpp
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
@@ -96,71 +95,50 @@ int main(int argc, char* argv[]) {
std::string prompt;

std::string model_path = argv[1];
ov::LLMPipeline pipe(model_path, "CPU");

pipe.start_chat();
for (size_t i = 0; i < questions.size(); i++) {
std::cout << "question:\n";
std::getline(std::cin, prompt);

std::cout << pipe(prompt) << std::endl>>;
}
pipe.finish_chat();
}
```
Specifying generation_config to use grouped beam search
``` cpp
int main(int argc, char* argv[]) {
std::string prompt;
std::string model_path = argv[1];
ov::LLMPipeline pipe(model_path, "CPU");
ov::genai::LLMPipeline pipe(model_path, "CPU");

ov::GenerationConfig config = pipe.get_generation_config();
ov::genai::GenerationConfig config = pipe.get_generation_config();
config.max_new_tokens = 256;
config.num_groups = 3;
config.group_size = 5;
config.diversity_penalty = 1.0f;

auto streamer = [](std::string word) { std::cout << word << std::flush; };
pipe.start_chat();
for (size_t i = 0; i < questions.size(); i++) {
for (;;) {
std::cout << "question:\n";
cout << prompt << endl;
std::getline(std::cin, prompt);
if (prompt == "Stop!")
break;

auto answer = pipe(prompt, config, streamer);
// no need to print answer, streamer will do that
std::cout << "answer:\n";
auto answer = pipe(prompt, config);
std::cout << answer << std::endl;
}
pipe.finish_chat();
}
```
Streaming exapmle with lambda function

Streaming example with lambda function
``` cpp

#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
int main(int argc, char* argv[]) {
std::string model_path = argv[1];
ov::LLMPipeline pipe(model_path, "CPU");
ov::genai::LLMPipeline pipe(model_path, "CPU");
auto streamer = [](std::string word) { std::cout << word << std::flush; };
std::cout << pipe.generate("The Sun is yellow bacause", streamer);
}
```

Streaming with custom class
Streaming with a custom class
``` cpp
#include <streamer_base.hpp>
#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

class CustomStreamer: publict StreamerBase {
class CustomStreamer: public ov::genai::StreamerBase {
public:
void put(int64_t token) {
/* custom decoding/tokens processing code
@@ -179,7 +157,7 @@ int main(int argc, char* argv[]) {
CustomStreamer custom_streamer;

std::string model_path = argv[1];
ov::LLMPipeline pipe(model_path, "CPU");
cout << pipe.generate("The Sun is yellow bacause", custom_streamer);
ov::genai::LLMPipeline pipe(model_path, "CPU");
std::cout << pipe.generate("The Sun is yellow bacause", custom_streamer);
}
```
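
Editor's note: the custom-streamer example above is partly elided by the diff view. A self-contained sketch of the same idea follows; it is not part of this commit, it assumes `StreamerBase` also declares an `end()` finalization hook (not visible in this excerpt), and it passes the streamer as a `std::shared_ptr` to match the `StreamerVariant` definition that appears in `llm_pipeline.hpp` further down.
``` cpp
#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <memory>
#include <string>

// Minimal streamer that prints each produced token id as soon as it arrives.
class CustomStreamer : public ov::genai::StreamerBase {
public:
    void put(int64_t token) override {
        // Custom decoding / token post-processing would go here.
        std::cout << token << ' ' << std::flush;
    }
    void end() override {
        // Assumed finalization hook: flush whatever the decoder still buffers.
        std::cout << std::endl;
    }
};

int main(int argc, char* argv[]) {
    std::string model_path = argv[1];
    ov::genai::LLMPipeline pipe(model_path, "CPU");

    std::shared_ptr<ov::genai::StreamerBase> streamer = std::make_shared<CustomStreamer>();
    std::cout << pipe.generate("The Sun is yellow because", streamer);
}
```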
43 changes: 28 additions & 15 deletions src/cpp/include/openvino/genai/generation_config.hpp
@@ -12,6 +12,7 @@
#include "openvino/genai/tokenizer.hpp"

namespace ov {
namespace genai {

/**
* @brief controls the stopping condition for grouped beam search. The following values are possible:
@@ -22,43 +23,48 @@ namespace ov {
enum class StopCriteria { early, heuristic, never };

/**
* @brief structure to keep generation config parameters.
* @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
* and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
* be used while greedy and beam search parameters will not affect decoding at all.
*
* Generic parameters:
* @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
* `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set.
* @param max_new_tokens the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
* @param ignore_eos if set to true, then generation will not stop even if <eos> token is met.
* @param pad_token_id token_id of <pad> (padding)
* @param bos_token_id token_id of <bos> (beggining of sentence)
* @param eos_token_id token_id of <eos> (end of sentence)
* @param bos_token <bos> token string representation
* @param eos_token <eos> token string representation
*
* Beam search specific parameters:
* @param num_beams number of beams for beam search. 1 disables beam search.
* @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
* @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from other group at a
* particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled.
* particular time. See https://arxiv.org/pdf/1909.05858.
* @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
* the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
* likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
* `length_penalty` < 0.0 encourages shorter sequences.
* @param num_return_sequences the number of sequences to return for grouped beam search decoding
* @param num_return_sequences the number of sequences to return for grouped beam search decoding.
* @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once.
* @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values:
* "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an
* heuristic is applied and the generation stops when is it very unlikely to find better candidates;
* "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
* @param temperature the value used to modulate token probabilities for random sampling
*
* Random sampling parameters:
* @param temperature the value used to modulate token probabilities for random sampling.
* @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
* @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
* @param do_sample whether or not to use multinomial random sampling
* that add up to `top_p` or higher are kept.
* @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. See https://arxiv.org/pdf/1909.05858.
* @param pad_token_id id of padding token
* @param bos_token_id id of <bos> token
* @param eos_token_id id of <eos> token
* @param bos_token <bos> token string representation
* @param eos_token <eos> token string representation
* @param draft_model draft model for assitive decoding
* @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
* @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
*/
class OPENVINO_GENAI_EXPORTS GenerationConfig {
public:
GenerationConfig() = default;
GenerationConfig(std::string json_path);
explicit GenerationConfig(std::string json_path);

// Generic
size_t max_new_tokens = SIZE_MAX;
@@ -89,6 +95,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
// used for chat scenario
std::string bos_token = "<s>";
std::string eos_token = "</s>";

size_t get_max_new_tokens(size_t prompt_length = 0) const;
bool is_greedy_decoding() const;
bool is_beam_search() const;
bool is_multimomial() const;
static GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {});
};

} // namespace ov
} // namespace genai
} // namespace ov
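
Editor's note: a short sketch of how the parameters documented above fit together (not part of this commit; it assumes the member names used in the README examples earlier in this diff and the helper methods added here).
``` cpp
#include "openvino/genai/generation_config.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <string>

int main(int argc, char* argv[]) {
    std::string model_path = argv[1];
    ov::genai::LLMPipeline pipe(model_path, "CPU");

    // Start from the defaults loaded with the converted model.
    ov::genai::GenerationConfig config = pipe.get_generation_config();

    // Generic parameter: cap the number of newly generated tokens.
    config.max_new_tokens = 100;

    // Group beam search parameters, as in the README examples above.
    config.num_groups = 3;
    config.group_size = 5;
    config.diversity_penalty = 1.5f;

    // Helpers added by this merge: effective token budget and decoding-mode checks.
    std::cout << "token budget: " << config.get_max_new_tokens() << "\n";
    std::cout << "beam search: " << std::boolalpha << config.is_beam_search() << "\n";

    std::cout << pipe("The Sun is yellow because", config);
}
```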
42 changes: 23 additions & 19 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -6,12 +6,13 @@
#include <optional>
#include <variant>

#include <openvino/core/any.hpp>
#include "openvino/core/any.hpp"
#include "openvino/genai/generation_config.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "openvino/genai/streamer_base.hpp"

namespace ov {
namespace genai {

using StreamerVariant = std::variant<std::function<void (std::string)>, std::shared_ptr<StreamerBase>>;
using OptionalGenerationConfig = std::optional<GenerationConfig>;
@@ -71,7 +72,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param device optional device
* @param plugin_config optional plugin_config
*/
LLMPipeline(std::string& path, std::string device="CPU",
LLMPipeline(const std::string& path, const std::string& device="CPU",
const ov::AnyMap& plugin_config={},
const std::string& ov_tokenizers_path="");

Expand All @@ -84,11 +85,10 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param plugin_config optional plugin_config
*/
LLMPipeline(
const std::string model_path,
const ov::Tokenizer& tokenizer,
const std::string device="CPU",
const ov::AnyMap& plugin_config = {},
const std::string& ov_tokenizers_path=""
const std::string& model_path,
const ov::genai::Tokenizer& tokenizer,
const std::string& device="CPU",
const ov::AnyMap& plugin_config = {}
);

~LLMPipeline();
@@ -127,8 +127,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
* @param generation_config optional GenerationConfig
* @return DecodedResults a structure with resulting texts & scores
*/
DecodedResults generate(std::vector<std::string> texts, OptionalGenerationConfig generation_config);
DecodedResults generate(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config);
DecodedResults generate(const std::vector<std::string>& texts, OptionalGenerationConfig generation_config);

/**
* @brief Low level generate to be called with already encoded input_ids tokens.
@@ -153,14 +152,19 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
return generate(text, AnyMap{std::forward<Properties>(properties)...});
}

DecodedResults operator()(std::vector<std::string> text, OptionalGenerationConfig generation_config=std::nullopt);
DecodedResults operator()(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config=std::nullopt);
DecodedResults operator()(const std::vector<std::string>& text, OptionalGenerationConfig generation_config=std::nullopt) {
return generate(text, generation_config);
}

// generate with streamers
std::string operator()(std::string text, OptionalGenerationConfig generation_config=std::nullopt, OptionalStreamerVariant streamer=std::nullopt);
std::string operator()(std::string text, OptionalStreamerVariant streamer);
std::string operator()(
std::string text,
OptionalGenerationConfig generation_config=std::nullopt,
OptionalStreamerVariant streamer=std::nullopt
) {
return generate(text, generation_config, streamer);
}

ov::Tokenizer get_tokenizer();
ov::genai::Tokenizer get_tokenizer();
GenerationConfig get_generation_config() const;
void set_generation_config(const GenerationConfig& generation_config);

@@ -174,10 +178,9 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
};

/*
* utils that allow to use generate and operarator() in the folllowing way:
* utils that allow to use generate and operator() in the following way:
* pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...)
* pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...)
* All names match to names in cofnig except streamer.
*/
static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
static constexpr ov::Property<size_t> max_length{"max_length"};
@@ -207,6 +210,7 @@ static constexpr ov::Property<std::string> eos_token{"eos_token"};

// only lambda streamer can be set via ov::streamer(),... syntaxic sugar,
// because std::variant<StremaerBase, std::function<>> can not be stored in AnyMap
static constexpr ov::Property<std::function<void (std::string)>> streamer_lambda{"streamer"};
static constexpr ov::Property<std::function<void (std::string)>> streamer{"streamer"};

} // namespace ov
} // namespace genai
} // namespace ov
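
Editor's note: the header comment above sketches the property-based calling style; a hedged example of it and of the batched `generate` overload added in this diff follows (not part of the commit). It assumes a `temperature` property is declared in an elided portion of this header, as the comment suggests, and that the properties resolve under `ov::genai::` after the namespace move.
``` cpp
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <optional>
#include <string>
#include <vector>

int main(int argc, char* argv[]) {
    std::string model_path = argv[1];
    ov::genai::LLMPipeline pipe(model_path, "CPU");

    // Property-style call, mirroring the usage sketched in the header comment.
    auto answer = pipe.generate(
        "The Sun is yellow because",
        ov::genai::max_new_tokens(200),
        ov::genai::temperature(1.0f));
    std::cout << answer << "\n";

    // Batched overload added by this merge: several prompts, default config.
    std::vector<std::string> prompts = {"What is OpenVINO?", "What is a tokenizer?"};
    ov::genai::DecodedResults results = pipe.generate(prompts, std::nullopt);
    // `results` carries the generated texts and scores; member names are not
    // shown in this excerpt, so they are left unused here.
    (void)results;
}
```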