From 7cfedba3390138fc0d8e42dcf78c3705a3e25654 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 22 Oct 2024 06:24:47 +0400 Subject: [PATCH] Align pipelines (#1031) - Added template constructors for Pipelines to pass arbitrary number of properties - Dropped `_draft_model` WA and have only `draft_model` - Aligned all pipelines to accept `std::filesystem::path` as model path. - Added `Path` support to Python - Dropped Whisper ctor which accepts `Tokenizer` - Dropped `CPU` as default device. It conflicts with OpenVINO where default device is `AUTO` - Aligned argument names - models_path and properties --- README.md | 22 ++- .../beam_search_causal_lm.cpp | 4 +- .../cpp/benchmark_genai/benchmark_genai.cpp | 4 +- samples/cpp/chat_sample/chat_sample.cpp | 4 +- .../cpp/greedy_causal_lm/greedy_causal_lm.cpp | 4 +- .../lora_greedy_causal_lm.cpp | 4 +- .../multinomial_causal_lm.cpp | 4 +- .../whisper_speech_recognition.cpp | 9 +- .../python/benchmark_genai/benchmark_genai.py | 4 +- .../whisper_speech_recognition.py | 6 +- src/README.md | 42 ++--- .../genai/continuous_batching_pipeline.hpp | 23 +-- .../openvino/genai/generation_config.hpp | 5 +- .../include/openvino/genai/llm_pipeline.hpp | 83 ++++----- .../include/openvino/genai/lora_adapter.hpp | 4 +- .../genai/text2image/autoencoder_kl.hpp | 9 +- .../genai/text2image/clip_text_model.hpp | 9 +- .../clip_text_model_with_projection.hpp | 17 +- .../openvino/genai/text2image/pipeline.hpp | 15 +- .../text2image/unet2d_condition_model.hpp | 13 +- src/cpp/include/openvino/genai/tokenizer.hpp | 18 +- .../genai/visual_language/pipeline.hpp | 42 ++++- .../genai/whisper_generation_config.hpp | 3 +- .../openvino/genai/whisper_pipeline.hpp | 33 ++-- src/cpp/src/continuous_batching_impl.cpp | 18 +- src/cpp/src/continuous_batching_impl.hpp | 15 +- .../continuous_batching_impl_interface.cpp | 2 + src/cpp/src/continuous_batching_pipeline.cpp | 30 +-- src/cpp/src/generation_config.cpp | 4 +- src/cpp/src/llm_pipeline.cpp | 75 ++++---- src/cpp/src/llm_pipeline_static.cpp | 58 +++--- src/cpp/src/lora_adapter.cpp | 8 +- .../speculative_decoding_impl.cpp | 32 ++-- .../speculative_decoding_impl.hpp | 20 +- src/cpp/src/text2image/diffusion_pipeline.hpp | 4 +- .../src/text2image/models/autoencoder_kl.cpp | 16 +- .../src/text2image/models/clip_text_model.cpp | 18 +- .../clip_text_model_with_projection.cpp | 12 +- .../models/unet2d_condition_model.cpp | 16 +- src/cpp/src/text2image/schedulers/ddim.cpp | 4 +- src/cpp/src/text2image/schedulers/ddim.hpp | 5 +- .../text2image/schedulers/euler_discrete.cpp | 4 +- .../text2image/schedulers/euler_discrete.hpp | 5 +- src/cpp/src/text2image/schedulers/lcm.cpp | 4 +- src/cpp/src/text2image/schedulers/lcm.hpp | 5 +- .../text2image/schedulers/lms_discrete.cpp | 4 +- .../text2image/schedulers/lms_discrete.hpp | 5 +- .../src/text2image/schedulers/scheduler.cpp | 2 +- .../text2image/stable_diffusion_pipeline.hpp | 25 +-- .../stable_diffusion_xl_pipeline.hpp | 28 +-- .../src/text2image/text2image_pipeline.cpp | 4 +- src/cpp/src/tokenizer.cpp | 27 ++- src/cpp/src/tokenizers_path.hpp | 3 +- src/cpp/src/utils.cpp | 22 +-- src/cpp/src/utils.hpp | 8 +- src/cpp/src/visual_language/pipeline.cpp | 16 +- src/cpp/src/whisper/whisper_config.cpp | 4 +- src/cpp/src/whisper/whisper_config.hpp | 4 +- .../src/whisper/whisper_feature_extractor.cpp | 6 +- .../src/whisper/whisper_feature_extractor.hpp | 6 +- src/cpp/src/whisper_generation_config.cpp | 4 +- src/cpp/src/whisper_pipeline.cpp | 50 ++--- src/python/py_generate_pipeline.cpp | 173 
+++++++++--------- src/python/py_lora_adapter.cpp | 3 +- src/python/py_text2image_models.cpp | 37 ++-- src/python/py_text2image_pipeline.cpp | 23 ++- src/python/py_utils.cpp | 2 +- src/python/py_vlm_pipeline.cpp | 45 ++--- src/python/py_whisper_pipeline.cpp | 61 +++--- tests/python_tests/common.py | 28 +-- tests/python_tests/ov_genai_test_utils.py | 10 +- .../python_tests/test_cache_optimizations.py | 18 +- tests/python_tests/test_chat_generate_api.py | 2 +- tests/python_tests/test_generate_api.py | 2 +- tests/python_tests/test_preemption.py | 12 +- tests/python_tests/test_sampling.py | 14 +- tests/python_tests/test_vlm_api.py | 12 +- .../python_tests/test_whisper_generate_api.py | 45 +---- ...ntinuous_batching_speculative_decoding.cpp | 6 +- tools/llm_bench/llm_bench_utils/ov_utils.py | 2 +- 80 files changed, 722 insertions(+), 727 deletions(-) diff --git a/README.md b/README.md index c2225ae374..3da882175b 100644 --- a/README.md +++ b/README.md @@ -86,8 +86,8 @@ Code below requires installation of C++ compatible package (see [here](https://d #include int main(int argc, char* argv[]) { - std::string model_path = argv[1]; - ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::string models_path = argv[1]; + ov::genai::LLMPipeline pipe(models_path, "CPU"); std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(100)) << '\n'; } ``` @@ -129,8 +129,8 @@ Code below requires installation of C++ compatible package (see [here](https://d #include int main(int argc, char* argv[]) { - std::string model_path = argv[1]; - ov::genai::VLMPipeline pipe(model_path, "CPU"); + std::string models_path = argv[1]; + ov::genai::VLMPipeline pipe(models_path, "CPU"); ov::Tensor rgb = utils::load_image(argv[2]); std::cout << pipe.generate( prompt, @@ -244,9 +244,10 @@ def main(): parser.add_argument("wav_file_path") args = parser.parse_args() - raw_speech = read_wav(args.wav_file_path) + device = 'CPU' # GPU can be used as well + pipe = openvino_genai.WhisperPipeline(args.model_dir, device) - pipe = openvino_genai.WhisperPipeline(args.model_dir) + raw_speech = read_wav(args.wav_file_path) def streamer(word: str) -> bool: print(word, end="") @@ -275,14 +276,15 @@ NOTE: This sample is a simplified version of the full sample that is available [ int main(int argc, char* argv[]) try { - std::string model_path = argv[1]; + std::filesystem::path models_path = argv[1]; std::string wav_file_path = argv[2]; + std::string device = "CPU"; // GPU can be used as well - ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); + ov::genai::WhisperPipeline pipeline(models_path, device); - ov::genai::WhisperPipeline pipeline{model_path}; + ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); - ov::genai::WhisperGenerationConfig config{model_path + "/generation_config.json"}; + ov::genai::WhisperGenerationConfig config(models_path / "generation_config.json"); config.max_new_tokens = 100; // 'task' and 'language' parameters are supported for multilingual models only config.language = "<|en|>"; diff --git a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp index 5f3187f33b..236b31b351 100644 --- a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp +++ b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp @@ -8,10 +8,10 @@ int main(int argc, char* argv[]) try { throw std::runtime_error(std::string{"Usage: "} + argv[0] + " '' ['' ...]"); } auto prompts = std::vector(argv + 2, 
argv + argc); - std::string model_path = argv[1]; + std::string models_path = argv[1]; std::string device = "CPU"; // GPU can be used as well - ov::genai::LLMPipeline pipe(model_path, device); + ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; config.max_new_tokens = 20; diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp index 8fadeac444..d389e94432 100644 --- a/samples/cpp/benchmark_genai/benchmark_genai.cpp +++ b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -31,7 +31,7 @@ int main(int argc, char* argv[]) try { } std::string prompt = result["prompt"].as(); - const std::string model_path = result["model"].as(); + const std::string models_path = result["model"].as(); std::string device = result["device"].as(); size_t num_warmup = result["num_warmup"].as(); size_t num_iter = result["num_iter"].as(); @@ -39,7 +39,7 @@ int main(int argc, char* argv[]) try { ov::genai::GenerationConfig config; config.max_new_tokens = result["max_new_tokens"].as(); - ov::genai::LLMPipeline pipe(model_path, device); + ov::genai::LLMPipeline pipe(models_path, device); for (size_t i = 0; i < num_warmup; i++) pipe.generate(prompt, config); diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp index 827c08ae57..41d63fc0f1 100644 --- a/samples/cpp/chat_sample/chat_sample.cpp +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -8,10 +8,10 @@ int main(int argc, char* argv[]) try { throw std::runtime_error(std::string{"Usage: "} + argv[0] + " "); } std::string prompt; - std::string model_path = argv[1]; + std::string models_path = argv[1]; std::string device = "CPU"; // GPU, NPU can be used as well - ov::genai::LLMPipeline pipe(model_path, device); + ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; config.max_new_tokens = 100; diff --git a/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp b/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp index 09e6af65e8..b5ca59095b 100644 --- a/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp +++ b/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp @@ -7,11 +7,11 @@ int main(int argc, char* argv[]) try { if (3 > argc) throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\""); - std::string model_path = argv[1]; + std::string models_path = argv[1]; std::string prompt = argv[2]; std::string device = "CPU"; // GPU can be used as well - ov::genai::LLMPipeline pipe(model_path, device); + ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; config.max_new_tokens = 100; std::string result = pipe.generate(prompt, config); diff --git a/samples/cpp/lora_greedy_causal_lm/lora_greedy_causal_lm.cpp b/samples/cpp/lora_greedy_causal_lm/lora_greedy_causal_lm.cpp index e9e3d8c2a4..b854b95fd3 100644 --- a/samples/cpp/lora_greedy_causal_lm/lora_greedy_causal_lm.cpp +++ b/samples/cpp/lora_greedy_causal_lm/lora_greedy_causal_lm.cpp @@ -7,7 +7,7 @@ int main(int argc, char* argv[]) try { if (4 > argc) throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\""); - std::string model_path = argv[1]; + std::string models_path = argv[1]; std::string adapter_path = argv[2]; std::string prompt = argv[3]; std::string device = "CPU"; // GPU can be used as well @@ -15,7 +15,7 @@ int main(int argc, char* argv[]) try { using namespace ov::genai; Adapter adapter(adapter_path); - LLMPipeline pipe(model_path, device, adapters(adapter)); // register all required adapters here + LLMPipeline 
pipe(models_path, device, adapters(adapter)); // register all required adapters here std::cout << "Generate with LoRA adapter and alpha set to 0.75:" << std::endl; std::cout << pipe.generate(prompt, max_new_tokens(100), adapters(adapter, 0.75)) << std::endl; diff --git a/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp b/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp index 1525cbc38a..bb74deffb1 100644 --- a/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp +++ b/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp @@ -8,11 +8,11 @@ int main(int argc, char* argv[]) try { throw std::runtime_error(std::string{"Usage: "} + argv[0] + " ''"); } - std::string model_path = argv[1]; + std::string models_path = argv[1]; std::string prompt = argv[2]; std::string device = "CPU"; // GPU can be used as well - ov::genai::LLMPipeline pipe(model_path, device); + ov::genai::LLMPipeline pipe(models_path, device); ov::genai::GenerationConfig config; config.max_new_tokens = 100; diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp index 6ce345d691..48770dd61b 100644 --- a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp +++ b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp @@ -9,14 +9,15 @@ int main(int argc, char* argv[]) try { throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\""); } - std::string model_path = argv[1]; + std::filesystem::path models_path = argv[1]; std::string wav_file_path = argv[2]; + std::string device = "CPU"; // GPU can be used as well - ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); + ov::genai::WhisperPipeline pipeline(models_path, device); - ov::genai::WhisperPipeline pipeline{model_path}; + ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); - ov::genai::WhisperGenerationConfig config{model_path + "/generation_config.json"}; + ov::genai::WhisperGenerationConfig config(models_path / "generation_config.json"); config.max_new_tokens = 100; // 'task' and 'language' parameters are supported for multilingual models only config.language = "<|en|>"; diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/benchmark_genai/benchmark_genai.py index 9851483880..d2cc91e04d 100755 --- a/samples/python/benchmark_genai/benchmark_genai.py +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ -18,7 +18,7 @@ def main(): # Perf metrics is stored in DecodedResults. # In order to get DecodedResults instead of a string input should be a list. 
prompt = [args.prompt] - model_path = args.model + models_path = args.model device = args.device num_warmup = args.num_warmup num_iter = args.num_iter @@ -26,7 +26,7 @@ def main(): config = ov_genai.GenerationConfig() config.max_new_tokens = args.max_new_tokens - pipe = ov_genai.LLMPipeline(model_path, device) + pipe = ov_genai.LLMPipeline(models_path, device) for _ in range(num_warmup): pipe.generate(prompt, config) diff --git a/samples/python/whisper_speech_recognition/whisper_speech_recognition.py b/samples/python/whisper_speech_recognition/whisper_speech_recognition.py index 38a025dbed..74e0941816 100755 --- a/samples/python/whisper_speech_recognition/whisper_speech_recognition.py +++ b/samples/python/whisper_speech_recognition/whisper_speech_recognition.py @@ -18,8 +18,6 @@ def main(): parser.add_argument("wav_file_path") args = parser.parse_args() - raw_speech = read_wav(args.wav_file_path) - config = openvino_genai.WhisperGenerationConfig( args.model_dir + "/generation_config.json" ) @@ -29,12 +27,14 @@ def main(): config.task = "transcribe" config.return_timestamps = True - pipe = openvino_genai.WhisperPipeline(args.model_dir) + device = 'CPU' # GPU can be used as well + pipe = openvino_genai.WhisperPipeline(args.model_dir, device) def streamer(word: str) -> bool: print(word, end="") return False + raw_speech = read_wav(args.wav_file_path) result = pipe.generate(raw_speech, config, streamer) print() diff --git a/src/README.md b/src/README.md index 73fc97d1e9..b4635767cd 100644 --- a/src/README.md +++ b/src/README.md @@ -55,14 +55,14 @@ If you want to try OpenVINO GenAI with different dependencies versions (**not** A simple example: ```python import openvino_genai as ov_genai -pipe = ov_genai.LLMPipeline(model_path, "CPU") +pipe = ov_genai.LLMPipeline(models_path, "CPU") print(pipe.generate("The Sun is yellow because", max_new_tokens=100)) ``` Calling generate with custom generation config parameters, e.g. 
config for grouped beam search: ```python import openvino_genai as ov_genai -pipe = ov_genai.LLMPipeline(model_path, "CPU") +pipe = ov_genai.LLMPipeline(models_path, "CPU") result = pipe.generate("The Sun is yellow because", max_new_tokens=100, num_beam_groups=3, num_beams=15, diversity_penalty=1.5) print(result) @@ -76,7 +76,7 @@ output: A simple chat in Python: ```python import openvino_genai as ov_genai -pipe = ov_genai.LLMPipeline(model_path) +pipe = ov_genai.LLMPipeline(models_path) config = {'max_new_tokens': 100, 'num_beam_groups': 3, 'num_beams': 15, 'diversity_penalty': 1.5} pipe.set_generation_config(config) @@ -101,8 +101,8 @@ A simple example: #include int main(int argc, char* argv[]) { - std::string model_path = argv[1]; - ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::string models_path = argv[1]; + ov::genai::LLMPipeline pipe(models_path, "CPU"); std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(256)); } ``` @@ -113,8 +113,8 @@ Using group beam search decoding: #include int main(int argc, char* argv[]) { - std::string model_path = argv[1]; - ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::string models_path = argv[1]; + ov::genai::LLMPipeline pipe(models_path, "CPU"); ov::genai::GenerationConfig config; config.max_new_tokens = 256; @@ -134,8 +134,8 @@ A simple chat in C++ using grouped beam search decoding: int main(int argc, char* argv[]) { std::string prompt; - std::string model_path = argv[1]; - ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::string models_path = argv[1]; + ov::genai::LLMPipeline pipe(models_path, "CPU"); ov::genai::GenerationConfig config; config.max_new_tokens = 100; @@ -164,8 +164,8 @@ Streaming example with lambda function: #include int main(int argc, char* argv[]) { - std::string model_path = argv[1]; - ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::string models_path = argv[1]; + ov::genai::LLMPipeline pipe(models_path, "CPU"); auto streamer = [](std::string word) { std::cout << word << std::flush; @@ -202,8 +202,8 @@ public: int main(int argc, char* argv[]) { CustomStreamer custom_streamer; - std::string model_path = argv[1]; - ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::string models_path = argv[1]; + ov::genai::LLMPipeline pipe(models_path, "CPU"); std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(15), ov::genai::streamer(custom_streamer)); } ``` @@ -226,7 +226,7 @@ class CustomStreamer(ov_genai.StreamerBase): def end(self): # Custom finalization logic. 
-pipe = ov_genai.LLMPipeline(model_path, "CPU") +pipe = ov_genai.LLMPipeline(models_path, "CPU") custom_streamer = CustomStreamer() pipe.generate("The Sun is yellow because", max_new_tokens=15, streamer=custom_streamer) @@ -245,7 +245,7 @@ int main(int argc, char* argv[]) { // fill other fields in scheduler_config with custom data if required scheduler_config.cache_size = 1; // minimal possible KV cache size in GB, adjust as required - ov::genai::LLMPipeline pipe(model_path, "CPU", ov::genai::scheduler_config(scheduler_config)); + ov::genai::LLMPipeline pipe(models_path, "CPU", ov::genai::scheduler_config(scheduler_config)); } ``` @@ -268,7 +268,7 @@ Performance metrics are stored either in the `DecodedResults` or `EncodedResults ```python import openvino_genai as ov_genai -pipe = ov_genai.LLMPipeline(model_path, "CPU") +pipe = ov_genai.LLMPipeline(models_path, "CPU") result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) perf_metrics = result.perf_metrics @@ -283,8 +283,8 @@ print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s') #include int main(int argc, char* argv[]) { - std::string model_path = argv[1]; - ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::string models_path = argv[1]; + ov::genai::LLMPipeline pipe(models_path, "CPU"); auto result = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20)); auto perf_metrics = result.perf_metrics; @@ -311,8 +311,8 @@ Several `perf_metrics` can be added to each other. In that case `raw_metrics` ar #include int main(int argc, char* argv[]) { - std::string model_path = argv[1]; - ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::string models_path = argv[1]; + ov::genai::LLMPipeline pipe(models_path, "CPU"); auto result_1 = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20)); auto result_2 = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20)); auto perf_metrics = result_1.perf_metrics + result_2.perf_metrics @@ -327,7 +327,7 @@ int main(int argc, char* argv[]) { ```python import openvino_genai as ov_genai -pipe = ov_genai.LLMPipeline(model_path, "CPU") +pipe = ov_genai.LLMPipeline(models_path, "CPU") res_1 = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) res_2 = pipe.generate(["Why Sky is blue because"], max_new_tokens=20) perf_metrics = res_1.perf_metrics + res_2.perf_metrics diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index d1da15aa29..2bf5dd773b 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -4,6 +4,8 @@ #pragma once #include +#include + #include #include "openvino/genai/scheduler_config.hpp" @@ -63,27 +65,27 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { ContinuousBatchingPipeline() = default; public: - ContinuousBatchingPipeline(const std::string& models_path, + ContinuousBatchingPipeline(const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, - const std::string& device = "CPU", - const ov::AnyMap& llm_plugin_config = {}, - const ov::AnyMap& tokenizer_plugin_config = {}); + const std::string& device, + const ov::AnyMap& properties = {}, + const ov::AnyMap& tokenizer_properties = {}); /** * @brief Constructs a ContinuousBatchingPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs. 
* - * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json + * @param models_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json * @param scheduler_config * @param tokenizer manually initialized ov::genai::Tokenizer * @param device optional device - * @param plugin_config optional plugin_config + * @param properties optional properties */ ContinuousBatchingPipeline( - const std::string& model_path, + const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, - const std::string& device="CPU", - const ov::AnyMap& plugin_config={} + const std::string& device, + const ov::AnyMap& properties = {} ); ov::genai::Tokenizer get_tokenizer(); @@ -109,10 +111,9 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { /** * @brief start chat with keeping history in kv cache. - * * @param system_message optional system message. */ - void start_chat(const std::string& system_message = ""); + void start_chat(const std::string& system_message = {}); /** * @brief finish chat and clear kv cache. diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index ee6997fc2b..bcc5aad713 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include #include @@ -11,7 +12,7 @@ #include "openvino/runtime/infer_request.hpp" #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/scheduler_config.hpp" -#include "lora_adapter.hpp" +#include "openvino/genai/lora_adapter.hpp" namespace ov { namespace genai { @@ -76,7 +77,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { public: GenerationConfig() = default; - explicit GenerationConfig(const std::string& json_path); + explicit GenerationConfig(const std::filesystem::path& json_path); // Generic size_t max_new_tokens = SIZE_MAX; diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 73be97d7a1..245669552c 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "openvino/core/any.hpp" #include "openvino/genai/generation_config.hpp" @@ -23,13 +24,6 @@ using OptionalGenerationConfig = std::optional; using EncodedInputs = std::variant; using StringInputs = std::variant>; -/** -* @brief scheduler_config property serves to activate continuous batching pipeline. -* Create SchedulerConfig and fill it with sutable values. Copy or move it to plugin_config. -* And create LLMPipeline instance with this config. -*/ -static constexpr ov::Property scheduler_config{"scheduler_config"}; - /** * @brief Structure to store resulting batched tokens and scores for each batch sequence. * The first num_return_sequences elements correspond to the first batch element. @@ -106,23 +100,27 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { /** * @brief Constructs an LLMPipeline from xml/bin files, tokenizers and configuration in the same dir. 
* - * @param model_path Path to the dir model xml/bin files, tokenizers and generation_configs.json + * @param models_path Path to the dir model xml/bin files, tokenizers and generation_configs.json * @param device optional device - * @param plugin_config optional plugin_config - * Add ov::genai::scheduler_config property to plugin_config to create continuous batching pipeline. - * Add ov::genai::adapters property to plugin_config to register LoRA adapters. + * @param properties optional properties + * Add ov::genai::scheduler_config property to properties to create continuous batching pipeline. + * Add ov::genai::adapters property to properties to register LoRA adapters. */ LLMPipeline( - const std::string& path, - const std::string& device="CPU", - const ov::AnyMap& plugin_config={} + const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties = {} ); + OPENVINO_DEPRECATED("Please, specify device explicitly when create LLMPipeline. This overload will be removed in 2025.0.0 release") + explicit LLMPipeline(const std::filesystem::path& path) : + LLMPipeline(path, "CPU") { } + /** * @brief Constructs an LLMPipeline from xml/bin files, tokenizers and configuration in the same dir. * Accepts arbitrary list of optional properties. * - * @param model_path Path to the dir model xml/bin files, tokenizers and generation_config.json + * @param models_path Path to the dir model xml/bin files, tokenizers and generation_config.json * @param device optional device * @param properties optional plugin properties, ov::genai::adapters property for LoRA adapters and * ov::genai::scheduler_config property to create continuous batching pipeline. Properties can be @@ -130,10 +128,10 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { */ template ::value, bool>::type = true> LLMPipeline( - const std::string& path, - const std::string& device="CPU", + const std::filesystem::path& models_path, + const std::string& device, Properties&&... properties) - : LLMPipeline(path, device, AnyMap{std::forward(properties)...}) { + : LLMPipeline(models_path, device, AnyMap{std::forward(properties)...}) { } /** @@ -152,19 +150,23 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { /** * @brief Constructs a LLMPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs. * - * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json + * @param models_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json * @param tokenizer manually initialized ov::genai::Tokenizer * @param device optional device - * @param plugin_config optional plugin_config + * @param properties optional plugin_config * Add ov::genai::scheduler_config property to plugin_config to create continuous batching pipeline */ LLMPipeline( - const std::string& model_path, + const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, - const std::string& device="CPU", - const ov::AnyMap& plugin_config = {} + const std::string& device, + const ov::AnyMap& properties = {} ); + OPENVINO_DEPRECATED("Please, specify device explicitly when create LLMPipeline. 
This overload will be removed in 2025.0.0 release") + LLMPipeline(const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer) : + LLMPipeline(models_path, tokenizer, "CPU") { } + ~LLMPipeline(); /** @@ -177,7 +179,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { */ DecodedResults generate( StringInputs inputs, - OptionalGenerationConfig generation_config=std::nullopt, + OptionalGenerationConfig generation_config = std::nullopt, StreamerVariant streamer=std::monostate() ); @@ -225,7 +227,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { */ EncodedResults generate( const EncodedInputs& inputs, - OptionalGenerationConfig generation_config=std::nullopt, + OptionalGenerationConfig generation_config = std::nullopt, StreamerVariant streamer=std::monostate() ); @@ -258,7 +260,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * * @param system_message optional system message. */ - void start_chat(const std::string& system_message = ""); + void start_chat(const std::string& system_message = {}); /** * @brief finish chat and clear kv cache. @@ -272,35 +274,34 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { OPENVINO_GENAI_EXPORTS std::pair streamer(StreamerVariant func); OPENVINO_GENAI_EXPORTS std::pair generation_config(const GenerationConfig& config); -OPENVINO_GENAI_EXPORTS std::pair _draft_model( - const std::string& model_path, - const std::string& device, - const ov::AnyMap& llm_config); +OPENVINO_GENAI_EXPORTS std::pair draft_model( + const std::filesystem::path& models_path, + const std::string& device = {}, + const ov::AnyMap& properties = {}); template ::value, bool>::type = true> inline std::pair draft_model( - const std::string& model_path, + const std::filesystem::path& models_path, const std::string& device, Properties&&... properties) { - return _draft_model(model_path, device, ov::AnyMap{std::forward(properties)...}); + return draft_model(models_path, device, ov::AnyMap{std::forward(properties)...}); } template ::value, bool>::type = true> inline std::pair draft_model( - const std::string& model_path, + const std::filesystem::path& models_path, Properties&&... properties) { - return _draft_model(model_path, "", ov::AnyMap{std::forward(properties)...}); + return draft_model(models_path, {}, ov::AnyMap{std::forward(properties)...}); } +/** +* @brief scheduler_config property serves to activate continuous batching pipeline. +* Create SchedulerConfig and fill it with sutable values. Copy or move it to plugin_config. +* And create LLMPipeline instance with this config. 
+*/ +static constexpr ov::Property scheduler_config{"scheduler_config"}; -inline std::pair -draft_model( - const std::string& model_path, - const std::string& device = "", - const ov::AnyMap& llm_config = ov::AnyMap()) { - return _draft_model(model_path, device, llm_config); -} } // namespace genai } // namespace ov diff --git a/src/cpp/include/openvino/genai/lora_adapter.hpp b/src/cpp/include/openvino/genai/lora_adapter.hpp index 03a42e9039..fd3e57af1e 100644 --- a/src/cpp/include/openvino/genai/lora_adapter.hpp +++ b/src/cpp/include/openvino/genai/lora_adapter.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "openvino/op/constant.hpp" #include "openvino/runtime/compiled_model.hpp" @@ -25,12 +26,13 @@ struct AdapterControllerImpl; class OPENVINO_GENAI_EXPORTS Adapter { class Impl; std::shared_ptr m_pimpl; + friend AdapterController; friend AdapterControllerImpl; friend bool operator== (const Adapter& a, const Adapter& b); friend bool operator< (const Adapter& a, const Adapter& b); public: - explicit Adapter(const std::string& path); + explicit Adapter(const std::filesystem::path& path); Adapter() = default; operator bool() const { diff --git a/src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp b/src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp index adea040f11..b1088f7448 100644 --- a/src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp +++ b/src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include @@ -25,18 +26,18 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL { float scaling_factor = 0.18215f; std::vector block_out_channels = { 64 }; - explicit Config(const std::string& config_path); + explicit Config(const std::filesystem::path& config_path); }; - explicit AutoencoderKL(const std::string& root_dir); + explicit AutoencoderKL(const std::filesystem::path& root_dir); - AutoencoderKL(const std::string& root_dir, + AutoencoderKL(const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties = {}); template ::value, bool>::type = true> - AutoencoderKL(const std::string& root_dir, + AutoencoderKL(const std::filesystem::path& root_dir, const std::string& device, Properties&&... properties) : AutoencoderKL(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } diff --git a/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp b/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp index f4a5a44b4f..63d2b4bafd 100644 --- a/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp +++ b/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include "openvino/genai/visibility.hpp" @@ -24,18 +25,18 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel { size_t hidden_size = 512; size_t num_hidden_layers = 13; - explicit Config(const std::string& config_path); + explicit Config(const std::filesystem::path& config_path); }; - explicit CLIPTextModel(const std::string root_dir); + explicit CLIPTextModel(const std::filesystem::path& root_dir); - CLIPTextModel(const std::string& root_dir, + CLIPTextModel(const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties = {}); template ::value, bool>::type = true> - CLIPTextModel(const std::string& root_dir, + CLIPTextModel(const std::filesystem::path& root_dir, const std::string& device, Properties&&... 
properties) : CLIPTextModel(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } diff --git a/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp b/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp index e755613cba..f176f461ab 100644 --- a/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp +++ b/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include "openvino/genai/visibility.hpp" @@ -24,20 +25,20 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection { size_t hidden_size = 512; size_t num_hidden_layers = 33; - explicit Config(const std::string& config_path); + explicit Config(const std::filesystem::path& config_path); }; - explicit CLIPTextModelWithProjection(const std::string root_dir); + explicit CLIPTextModelWithProjection(const std::filesystem::path& root_dir); - CLIPTextModelWithProjection(const std::string& root_dir, - const std::string& device, - const ov::AnyMap& properties = {}); + CLIPTextModelWithProjection(const std::filesystem::path& root_dir, + const std::string& device, + const ov::AnyMap& properties = {}); template ::value, bool>::type = true> - CLIPTextModelWithProjection(const std::string& root_dir, - const std::string& device, - Properties&&... properties) + CLIPTextModelWithProjection(const std::filesystem::path& root_dir, + const std::string& device, + Properties&&... properties) : CLIPTextModelWithProjection(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } CLIPTextModelWithProjection(const CLIPTextModelWithProjection&); diff --git a/src/cpp/include/openvino/genai/text2image/pipeline.hpp b/src/cpp/include/openvino/genai/text2image/pipeline.hpp index e3a59cf025..1101a8d084 100644 --- a/src/cpp/include/openvino/genai/text2image/pipeline.hpp +++ b/src/cpp/include/openvino/genai/text2image/pipeline.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "openvino/core/any.hpp" #include "openvino/runtime/properties.hpp" @@ -59,7 +60,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { EULER_DISCRETE }; - static std::shared_ptr from_config(const std::string& scheduler_config_path, + static std::shared_ptr from_config(const std::filesystem::path& scheduler_config_path, Type scheduler_type = AUTO); virtual ~Scheduler(); @@ -97,16 +98,16 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { } }; - explicit Text2ImagePipeline(const std::string& root_dir); + explicit Text2ImagePipeline(const std::filesystem::path& models_path); - Text2ImagePipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties = {}); + Text2ImagePipeline(const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties = {}); template ::value, bool>::type = true> - Text2ImagePipeline(const std::string& root_dir, - const std::string& device, - Properties&&... properties) - : Text2ImagePipeline(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } + Text2ImagePipeline(const std::filesystem::path& models_path, + const std::string& device, + Properties&&... 
properties) + : Text2ImagePipeline(models_path, device, ov::AnyMap{std::forward(properties)...}) { } // creates either LCM or SD pipeline from building blocks static Text2ImagePipeline stable_diffusion( diff --git a/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp b/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp index da1bec7759..c9a3f16f2d 100644 --- a/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp +++ b/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include #include @@ -27,20 +28,20 @@ class OPENVINO_GENAI_EXPORTS UNet2DConditionModel { std::vector block_out_channels = { 320, 640, 1280, 1280 }; int time_cond_proj_dim = -1; - explicit Config(const std::string& config_path); + explicit Config(const std::filesystem::path& config_path); }; - explicit UNet2DConditionModel(const std::string root_dir); + explicit UNet2DConditionModel(const std::filesystem::path& root_dir); - UNet2DConditionModel(const std::string& root_dir, + UNet2DConditionModel(const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties = {}); template ::value, bool>::type = true> - UNet2DConditionModel(const std::string& root_dir, - const std::string& device, - Properties&&... properties) + UNet2DConditionModel(const std::filesystem::path& root_dir, + const std::string& device, + Properties&&... properties) : UNet2DConditionModel(root_dir, device, ov::AnyMap{std::forward(properties)...}) { } UNet2DConditionModel(const UNet2DConditionModel&); diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 5aa7655c97..bcb8da68a3 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "openvino/runtime/tensor.hpp" #include "openvino/genai/visibility.hpp" @@ -29,8 +30,19 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /** * @brief ov::genai::Tokenizer constructor. * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path + * @param properties Properties passed to ov::Core::compile_model */ - Tokenizer(const std::string& tokenizer_path, const ov::AnyMap& plugin_config = {}); + Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties = {}); + + /** + * @brief ov::genai::Tokenizer constructor with variable number of properties + * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path + * @param properties optional properties + */ + template ::value, bool>::type = true> + Tokenizer(const std::filesystem::path& tokenizer_path, + Properties&&... properties) + : Tokenizer(tokenizer_path, ov::AnyMap{std::forward(properties)...}) { } /** * @brief encode a single prompt @@ -105,9 +117,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @return A string with the transformed and concatenated prompts from the chat history. * @throws Exception if the chat template was unable to parse the input history. */ - std::string apply_chat_template(ChatHistory history, + std::string apply_chat_template(ChatHistory history, bool add_generation_prompt, - const std::string& chat_template="") const; + const std::string& chat_template = {}) const; /// @brief Override a chat_template read from tokenizer_config.json. /// @param chat_template The new template to override with. 
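For reference, the snippet below sketches how the constructors aligned by this patch are meant to be used together: a `std::filesystem::path` models folder, an explicitly specified device string (the implicit `"CPU"` default is dropped), and an arbitrary list of properties, including the renamed `draft_model` helper and the `scheduler_config` property that switches the pipeline to continuous batching. This is an illustrative sketch, not code taken from the patch; the model folder names and the cache size are placeholders.

```cpp
// Illustrative sketch of the aligned LLMPipeline constructor shape after this patch.
// The model folder names below are placeholders.
#include <filesystem>
#include <iostream>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    std::filesystem::path models_path = "main_model_dir";   // placeholder: dir with openvino_model.xml, tokenizers, generation_config.json
    std::filesystem::path draft_path  = "draft_model_dir";  // placeholder: dir with the draft model for speculative decoding
    std::string device = "CPU";  // device is now passed explicitly; GPU can be used as well

    ov::genai::SchedulerConfig scheduler_config;
    scheduler_config.cache_size = 1;  // KV cache size in GB, adjust as required

    // Variadic-properties constructor: any number of ov::Property pairs after the device,
    // here the scheduler_config that activates continuous batching and the renamed
    // draft_model() helper that enables speculative decoding.
    ov::genai::LLMPipeline pipe(
        models_path,
        device,
        ov::genai::scheduler_config(scheduler_config),
        ov::genai::draft_model(draft_path, device));

    std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(32)) << '\n';
    return 0;
}
```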
diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index f2f42347c4..82f69677d0 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -3,30 +3,45 @@ #pragma once +#include +#include +#include + #include "openvino/genai/llm_pipeline.hpp" #include "openvino/genai/streamer_base.hpp" #include "openvino/genai/tokenizer.hpp" -#include namespace ov::genai { + /// @brief A Visual language modeling pipeline class used to generate a /// response or run a chat given a prompt and an image. class OPENVINO_GENAI_EXPORTS VLMPipeline { public: /// @brief Construct a pipeline form a folder containing tokenizer /// and model IRs. - /// @param model_dir A folder to read tokenizer and model IRs. + /// @param models_path A folder to read tokenizer and model IRs. /// @param device Inference device. A tokenizer is always compiled /// for CPU. - /// @param device_config A config to pass to ov::Core.set_property() - /// and ov::Core::compile_model(). - /// @param core ov::Core instance to use. - explicit VLMPipeline( - const std::filesystem::path& model_dir, - const std::string& device="CPU", - const ov::AnyMap device_config={} + /// @param properties A config to pass to ov::Core::compile_model(). + VLMPipeline( + const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties = {} ); + /// @brief Construct a pipeline form a folder containing tokenizer + /// and model IRs. Accepts arbitrary list of optional properties. + /// @param models_path A folder to read tokenizer and model IRs. + /// @param device Inference device. A tokenizer is always compiled + /// for CPU. + /// @param properties A config to pass to ov::Core::compile_model(). + template ::value, bool>::type = true> + VLMPipeline( + const std::filesystem::path& models_path, + const std::string& device, + Properties&&... properties) + : VLMPipeline(models_path, device, ov::AnyMap{std::forward(properties)...}) { } + /// @brief Default destructor. ~VLMPipeline(); @@ -43,6 +58,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { const GenerationConfig& generation_config, const StreamerVariant& streamer ); + /// @brief Generate a response given a prompt and config. /// @param prompt A prompt to respond to. /// @param config_map A config may contain GenerationConfig, values @@ -53,6 +69,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { const std::string& prompt, const ov::AnyMap& config_map ); + /// @brief Generate a response given a prompt and arbitrary number /// of ov::Property instances. /// Example: @@ -70,6 +87,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { prompt, AnyMap{std::forward(properties)...} ); } + /// @brief Activate chat mode. Chat preserves previous history and /// applies chat_template to input prompts. Calling start_chat() /// again or finish_chat() drops the memorized history. @@ -80,8 +98,10 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// in addition to user and assistant roles. Set a message for that /// role. void start_chat(const std::string& system_message=""); + /// @brief Deactivate chat mode. void finish_chat(); + /// @brief Set a custom chat template. Can be used to deactivate /// chat_template application for chat mode if called with /// "{% for message in messages %}{{ message['content'] }}{% endfor %}" @@ -89,15 +109,19 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// model chat_template. 
/// @param new_template A new template to override with. void set_chat_template(const std::string& new_template); + /// @brief Get a Tokenizer used to tokenize input and detokenize /// output. ov::genai::Tokenizer get_tokenizer() const; + /// @brief Extract GenerationConfig used to get default values. /// @return Default values used. GenerationConfig get_generation_config() const; + /// @brief Override default values for GenerationConfig /// @param new_config A config to override default values with. void set_generation_config(const GenerationConfig& new_config); + private: class VLMPipelineImpl; std::unique_ptr m_pimpl; diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp b/src/cpp/include/openvino/genai/whisper_generation_config.hpp index 5fdbfd6075..2bf9b7ab31 100644 --- a/src/cpp/include/openvino/genai/whisper_generation_config.hpp +++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp @@ -4,6 +4,7 @@ #pragma once #include +#include #include "openvino/genai/tokenizer.hpp" #include "openvino/runtime/compiled_model.hpp" @@ -17,7 +18,7 @@ namespace genai { class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig { public: WhisperGenerationConfig() = default; - explicit WhisperGenerationConfig(const std::string& json_path); + explicit WhisperGenerationConfig(const std::filesystem::path& json_path); // Generic diff --git a/src/cpp/include/openvino/genai/whisper_pipeline.hpp b/src/cpp/include/openvino/genai/whisper_pipeline.hpp index 689dd0eb35..f3ef215f70 100644 --- a/src/cpp/include/openvino/genai/whisper_pipeline.hpp +++ b/src/cpp/include/openvino/genai/whisper_pipeline.hpp @@ -3,9 +3,10 @@ #pragma once -#include +#include #include #include +#include #include "openvino/core/any.hpp" #include "openvino/genai/llm_pipeline.hpp" @@ -38,30 +39,30 @@ class OPENVINO_GENAI_EXPORTS WhisperPipeline { public: /** - * @brief Constructs an WhisperSpeechRecognitionPipeline from xml/bin files, tokenizers and configuration in the + * @brief Constructs a WhisperPipeline from xml/bin files, tokenizers and configuration in the * same dir. * - * @param model_path Path to the dir model xml/bin files, tokenizers and generation_configs.json + * @param models_path Path to the dir model xml/bin files, tokenizers and generation_configs.json * @param device optional device - * @param plugin_config optional plugin_config + * @param properties optional properties */ - WhisperPipeline(const std::string& model_path, - const std::string& device = "CPU", - const ov::AnyMap& plugin_config = {}); + WhisperPipeline(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties = {}); /** - * @brief Constructs a WhisperPipeline when ov::genai::Tokenizer is initialized manually using file - * from the different dirs. + * @brief Constructs a WhisperPipeline from xml/bin files, tokenizers and configuration in the + * same dir. Accepts arbitrary list of optional properties. 
* - * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json - * @param tokenizer manually initialized ov::genai::Tokenizer + * @param models_path Path to the dir model xml/bin files, tokenizers and generation_configs.json * @param device optional device - * @param plugin_config optional plugin_config + * @param properties optional properties */ - WhisperPipeline(const std::string& model_path, - const ov::genai::Tokenizer& tokenizer, - const std::string& device = "CPU", - const ov::AnyMap& plugin_config = {}); + template ::value, bool>::type = true> + WhisperPipeline(const std::string& models_path, + const std::string& device, + Properties&&... properties) + : WhisperPipeline(models_path, device, ov::AnyMap{std::forward(properties)...}) { } ~WhisperPipeline(); diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 9844354271..58fa9f0fc0 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -11,27 +11,27 @@ template struct overloaded : Ts... {using Ts::operator()...;}; template overloaded(Ts...) -> overloaded; ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl( - const std::string& models_path, + const std::filesystem::path& models_path, const Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, - const ov::AnyMap& plugin_config) { + const ov::AnyMap& properties) { m_tokenizer = tokenizer; ov::Core core; - auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config); - core.set_property(core_plugin_config); + auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(properties); + core.set_property(core_properties); // The model can be compiled for GPU as well - std::shared_ptr model = core.read_model(models_path + "/openvino_model.xml"); + std::shared_ptr model = core.read_model((models_path / "openvino_model.xml").string()); - DeviceConfig device_config(core, scheduler_config, device, compile_plugin_config); + DeviceConfig device_config(core, scheduler_config, device, compile_properties); bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction; utils::apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control); - init(model, scheduler_config, compile_plugin_config, device_config, core); + init(model, scheduler_config, compile_properties, device_config, core); } void ContinuousBatchingPipeline::ContinuousBatchingImpl::_pull_awaiting_requests() { @@ -43,10 +43,10 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_pull_awaiting_requests void ContinuousBatchingPipeline::ContinuousBatchingImpl::init( std::shared_ptr model, const SchedulerConfig& scheduler_config, - const ov::AnyMap& plugin_config, + const ov::AnyMap& properties, const DeviceConfig& device_config, ov::Core& core) { - ov::InferRequest infer_request = core.compile_model(model, device_config.get_device(), plugin_config).create_infer_request(); + ov::InferRequest infer_request = core.compile_model(model, device_config.get_device(), properties).create_infer_request(); // setup KV caches m_cache_manager = std::make_shared(device_config, core); diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp index c9b606fb42..c7ef681224 100644 --- a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -54,23 +54,22 @@ class 
ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc void _pull_awaiting_requests(); public: - ContinuousBatchingImpl(const std::string& models_path, + ContinuousBatchingImpl(const std::filesystem::path& models_path, const Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, - const ov::AnyMap& plugin_config); + const ov::AnyMap& properties); - ContinuousBatchingImpl(const std::string& models_path, + ContinuousBatchingImpl(const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, - const ov::AnyMap& llm_plugin_config, - const ov::AnyMap& tokenizer_plugin_config) + const ov::AnyMap& properties, + const ov::AnyMap& tokenizer_properties) : ContinuousBatchingImpl{ models_path, - Tokenizer(models_path, tokenizer_plugin_config), + Tokenizer(models_path, tokenizer_properties), scheduler_config, device, - llm_plugin_config } {}; - + properties } {} GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, diff --git a/src/cpp/src/continuous_batching_impl_interface.cpp b/src/cpp/src/continuous_batching_impl_interface.cpp index c5a2fb087b..10fc102aa0 100644 --- a/src/cpp/src/continuous_batching_impl_interface.cpp +++ b/src/cpp/src/continuous_batching_impl_interface.cpp @@ -4,6 +4,7 @@ #include "continuous_batching_impl_interface.hpp" namespace ov::genai { + GenerationConfig ContinuousBatchingPipeline::ImplInterface::get_config() const { return m_generation_config; } @@ -27,6 +28,7 @@ void ContinuousBatchingPipeline::ImplInterface::finish_chat() { m_is_chat_conversation = false; m_history.clear(); }; + std::vector ContinuousBatchingPipeline::ImplInterface::generate( const std::vector& prompts, diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index b0ce404fa2..6dcbf342eb 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -28,32 +28,32 @@ extract_draft_model_from_config(ov::AnyMap& config) { return draft_model; } -ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& models_path, +ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, - const ov::AnyMap& llm_plugin_config, - const ov::AnyMap& tokenizer_plugin_config) { - auto llm_plugin_config_without_draft_model = llm_plugin_config; - auto draft_model = extract_draft_model_from_config(llm_plugin_config_without_draft_model); - if (draft_model.model_path.empty()) { - m_impl = std::make_shared(models_path, scheduler_config, device, llm_plugin_config, tokenizer_plugin_config); + const ov::AnyMap& properties, + const ov::AnyMap& tokenizer_properties) { + auto properties_without_draft_model = properties; + auto draft_model = extract_draft_model_from_config(properties_without_draft_model); + if (draft_model.models_path.empty()) { + m_impl = std::make_shared(models_path, scheduler_config, device, properties, tokenizer_properties); } else { - m_impl = std::make_shared(models_path, scheduler_config, device, llm_plugin_config_without_draft_model, draft_model, tokenizer_plugin_config); + m_impl = std::make_shared(models_path, scheduler_config, device, properties_without_draft_model, draft_model, tokenizer_properties); } } ContinuousBatchingPipeline::ContinuousBatchingPipeline( - const std::string& model_path, + const std::filesystem::path& models_path, const Tokenizer& tokenizer, const 
SchedulerConfig& scheduler_config, const std::string& device, - const ov::AnyMap& plugin_config) { - auto plugin_config_without_draft_model = plugin_config; - auto draft_model = extract_draft_model_from_config(plugin_config_without_draft_model); - if (draft_model.model_path.empty()) { - m_impl = std::make_shared(model_path, tokenizer, scheduler_config, device, plugin_config); + const ov::AnyMap& properties) { + auto properties_without_draft_model = properties; + auto draft_model = extract_draft_model_from_config(properties_without_draft_model); + if (draft_model.models_path.empty()) { + m_impl = std::make_shared(models_path, tokenizer, scheduler_config, device, properties); } else { - m_impl = std::make_shared(model_path, scheduler_config, device, plugin_config_without_draft_model, draft_model); + m_impl = std::make_shared(models_path, scheduler_config, device, properties_without_draft_model, draft_model); } } diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index c66bdf5ec9..3b6742db59 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -14,11 +14,11 @@ namespace ov { namespace genai { -GenerationConfig::GenerationConfig(const std::string& json_path) { +GenerationConfig::GenerationConfig(const std::filesystem::path& json_path) { using utils::read_json_param; std::ifstream f(json_path); - OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); + OPENVINO_ASSERT(f.is_open(), "Failed to open '", json_path, "' with generation config"); nlohmann::json data = nlohmann::json::parse(f); diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 2a2c14f490..2605f79160 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -58,31 +58,30 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { bool m_is_cache_empty = true; std::optional m_selected_beam = std::nullopt; ChatHistory m_history; - std::string m_templated_chat_history = ""; + std::string m_templated_chat_history = {}; StatefulLLMPipeline( const ov::InferRequest& request, const ov::genai::Tokenizer& tokenizer, OptionalGenerationConfig generation_config=std::nullopt - ): LLMPipelineImplBase(tokenizer), + ) : LLMPipelineImplBase(tokenizer), m_model_runner(request) { GenerationConfig default_config; m_generation_config = (generation_config.has_value()) ? 
*generation_config : default_config; } StatefulLLMPipeline( - const std::filesystem::path& model_path, + const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config - ): - LLMPipelineImplBase(tokenizer, utils::from_config_json_if_exists(model_path)) + ) : LLMPipelineImplBase(tokenizer, utils::from_config_json_if_exists(models_path)) { ov::Core core; - if(auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) { + if (auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) { auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(*filtered_plugin_config); core.set_property(core_plugin_config); - auto model = core.read_model(model_path / "openvino_model.xml"); + auto model = core.read_model(models_path / "openvino_model.xml"); m_generation_config.adapters.set_tensor_name_prefix("base_model.model.model."); m_adapter_controller = AdapterController(model, m_generation_config.adapters, device); // TODO: Make the prefix name configurable utils::slice_matmul_statefull_model(model); @@ -91,7 +90,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } else { auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config); core.set_property(core_plugin_config); - auto model = core.read_model(model_path / "openvino_model.xml"); + auto model = core.read_model(models_path / "openvino_model.xml"); utils::slice_matmul_statefull_model(model); m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request(); } @@ -102,10 +101,10 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { } StatefulLLMPipeline( - const std::filesystem::path& model_path, + const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& plugin_config - ): StatefulLLMPipeline{model_path, Tokenizer(model_path.string()), device, plugin_config} {} + ) : StatefulLLMPipeline{models_path, Tokenizer(models_path.string()), device, plugin_config} {} DecodedResults generate( StringInputs inputs, @@ -369,18 +368,18 @@ std::pair generation_config(const GenerationConfig& config) { return {utils::CONFIG_ARG_NAME, Any::make(config)}; } -std::pair _draft_model( - const std::string& model_path, +std::pair draft_model( + const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& llm_config) { - ov::AnyMap plugin_config = llm_config; - if (plugin_config.count(ov::genai::scheduler_config.name())) { - auto scheduler_config = plugin_config.at(ov::genai::scheduler_config.name()).as(); - plugin_config.erase(ov::genai::scheduler_config.name()); - return { utils::DRAFT_MODEL_ARG_NAME, Any::make(model_path, device, plugin_config, scheduler_config) }; - } + const ov::AnyMap& properties) { + ov::AnyMap plugin_config = properties; + auto it = plugin_config.find(ov::genai::scheduler_config.name()); SchedulerConfig scheduler_config; - return { utils::DRAFT_MODEL_ARG_NAME, Any::make(model_path, device, plugin_config, scheduler_config) }; + if (it != plugin_config.end()) { + scheduler_config = it->second.as(); + plugin_config.erase(it); + } + return { utils::DRAFT_MODEL_ARG_NAME, Any::make(models_path, device, plugin_config, scheduler_config) }; } } // namespace genai @@ -405,16 +404,16 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { const ov::InferRequest& 
request, const Tokenizer& tokenizer, OptionalGenerationConfig generation_config - ): LLMPipelineImplBase{dont_construct()}, m_impl{"", {}} {} + ): LLMPipelineImplBase{dont_construct()}, m_impl{{}, {}, {}} {} ContinuousBatchingAdapter( - const std::filesystem::path& model_path, + const std::filesystem::path& models_path, const Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& plugin_config ): LLMPipelineImplBase{tokenizer}, m_impl{ - model_path.string(), + models_path.string(), tokenizer, scheduler_config, device, @@ -422,12 +421,12 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { } {} ContinuousBatchingAdapter( - const std::filesystem::path& model_path, + const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& plugin_config - ): LLMPipelineImplBase{Tokenizer(model_path.string())}, m_impl{ - model_path.string(), + ): LLMPipelineImplBase{Tokenizer(models_path.string())}, m_impl{ + models_path.string(), m_tokenizer, scheduler_config, device, @@ -546,28 +545,28 @@ ov::genai::LLMPipeline::LLMPipeline( } ov::genai::LLMPipeline::LLMPipeline( - const std::string& model_path, + const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const std::string& device, - const ov::AnyMap& plugin_config + const ov::AnyMap& properties ){ auto start_time = std::chrono::steady_clock::now(); - if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end()) { - auto config_without_scheduler_config = plugin_config; + if (properties.find(ov::genai::scheduler_config.name()) != properties.end()) { + auto config_without_scheduler_config = properties; config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); - auto& scheduler_config = plugin_config.at(ov::genai::scheduler_config.name()).as(); - m_pimpl = std::make_unique(model_path, tokenizer, scheduler_config, device, config_without_scheduler_config); + auto& scheduler_config = properties.at(ov::genai::scheduler_config.name()).as(); + m_pimpl = std::make_unique(models_path, tokenizer, scheduler_config, device, config_without_scheduler_config); } else if ("NPU" == device) { - m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); + m_pimpl = std::make_unique(models_path, tokenizer, device, properties); } else { - m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); + m_pimpl = std::make_unique(models_path, tokenizer, device, properties); } auto stop_time = std::chrono::steady_clock::now(); m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } ov::genai::LLMPipeline::LLMPipeline( - const std::string& path, + const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& config ){ @@ -576,11 +575,11 @@ ov::genai::LLMPipeline::LLMPipeline( auto config_without_scheduler_config = config; config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as(); - m_pimpl = std::make_unique(path, scheduler_config, device, config_without_scheduler_config); + m_pimpl = std::make_unique(models_path, scheduler_config, device, config_without_scheduler_config); } else if ("NPU" == device) { - m_pimpl = std::make_unique(path, device, config); + m_pimpl = std::make_unique(models_path, device, config); } else { - m_pimpl = std::make_unique(path, device, config); + m_pimpl = std::make_unique(models_path, 
device, config); } auto stop_time = std::chrono::steady_clock::now(); m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index 0b1d6abc9c..eaed1b912c 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -334,13 +334,13 @@ namespace ov { namespace genai { StaticLLMPipeline::StaticLLMPipeline( - const std::filesystem::path& path, + const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& config ) : LLMPipelineImplBase(tokenizer, - utils::from_config_json_if_exists(path)) { - auto pipeline_config = config; + utils::from_config_json_if_exists(models_path)) { + auto properties = config; /* NB: Static LLM pipeline consists of two models, first to process the input prompt (prefill), second to use in generation loop (kvcache) @@ -353,27 +353,27 @@ StaticLLMPipeline::StaticLLMPipeline( 2. When both models are directly imported from provided prefill and generation precompiled blobs, that is "USE_BLOBS=YES" way. */ - const auto use_blobs = pop_or_default(pipeline_config, "USE_BLOBS", false); + const auto use_blobs = pop_or_default(properties, "USE_BLOBS", false); if (!use_blobs) { - setupAndCompileModels(path, device, pipeline_config); + setupAndCompileModels(models_path, device, properties); } else { - setupAndImportModels(path, device, pipeline_config); + setupAndImportModels(models_path, device, properties); } // Initialize tensors prepare_for_new_conversation(); }; StaticLLMPipeline::StaticLLMPipeline( - const std::filesystem::path& path, + const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& config -) : StaticLLMPipeline(path, path.string(), device, config) { + const ov::AnyMap& properties +) : StaticLLMPipeline(models_path, Tokenizer(models_path), device, properties) { } void StaticLLMPipeline::setupAndCompileModels( - const std::filesystem::path& path, + const std::filesystem::path& models_path, const std::string& device, - ov::AnyMap& pipeline_config) { + ov::AnyMap& properties) { /* Initialization assumes multiple steps if user passes "USE_BLOBS=NO": 1) Read the template model - this will be kvcache model 2) Expose KV-cache input and output layers from kvcache model @@ -390,7 +390,7 @@ void StaticLLMPipeline::setupAndCompileModels( auto npudesc = extract_npu_descriptor(core); // (1) Read the template model - this will be kvcache model - m_kvcache_model = core.read_model(path / "openvino_model.xml"); + m_kvcache_model = core.read_model((models_path / "openvino_model.xml").string()); // (2) Expose KV-cache input and output layers from kvcache model ov::pass::StatefulToStateless().run_on_model(m_kvcache_model); // (3) Align u4 ZP constants @@ -403,21 +403,21 @@ void StaticLLMPipeline::setupAndCompileModels( m_prefill_model = m_kvcache_model->clone(); m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill"); // (7) Reshape both models to static shape - const uint32_t kMaxPromptLen = pop_int_and_cast(pipeline_config, "MAX_PROMPT_LEN").value_or(1024u); - const uint32_t kMinResponseLen = pop_int_and_cast(pipeline_config, "MIN_RESPONSE_LEN").value_or(128u); - KVAxesPosition axes = get_kv_axes(get_model_type_from_json(path / "config.json")); + const uint32_t kMaxPromptLen = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u); + const uint32_t kMinResponseLen = pop_int_and_cast(properties, 
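A sketch of passing the NPU-specific options handled in the static pipeline above; the key names MAX_PROMPT_LEN, MIN_RESPONSE_LEN and USE_BLOBS come from this code, while the values and the model directory are placeholders.

#include <filesystem>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    std::filesystem::path models_path = "model_dir";  // placeholder

    ov::AnyMap properties{
        {"MAX_PROMPT_LEN", 1024},   // static prompt length the prefill model is reshaped to
        {"MIN_RESPONSE_LEN", 256},  // minimum generation length reserved in the KV-cache model
        {"USE_BLOBS", false}        // compile from openvino_model.xml rather than importing blobs
    };

    // "NPU" as device routes LLMPipeline to the static pipeline.
    ov::genai::LLMPipeline pipe(models_path, "NPU", properties);
    return 0;
}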
"MIN_RESPONSE_LEN").value_or(128u); + KVAxesPosition axes = get_kv_axes(get_model_type_from_json(models_path / "config.json")); m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len }; reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes); // (8) Compile both model auto prefill_config = pop_or_default( - pipeline_config, "PREFILL_CONFIG", get_default_prefill_config(m_prefill_model, npudesc) + properties, "PREFILL_CONFIG", get_default_prefill_config(m_prefill_model, npudesc) ); auto generate_config = pop_or_default( - pipeline_config, "GENERATE_CONFIG", get_default_generate_config(m_kvcache_model) + properties, "GENERATE_CONFIG", get_default_generate_config(m_kvcache_model) ); - merge_config_with(prefill_config, pipeline_config); - merge_config_with(generate_config, pipeline_config); + merge_config_with(prefill_config, properties); + merge_config_with(generate_config, properties); // FIXME: Drop CACHE_DIR option if NPUW is enabled drop_cache_dir(prefill_config); drop_cache_dir(generate_config); @@ -431,9 +431,9 @@ void StaticLLMPipeline::setupAndCompileModels( } void StaticLLMPipeline::setupAndImportModels( - const std::filesystem::path& path, + const std::filesystem::path& models_path, const std::string& device, - ov::AnyMap& pipeline_config) { + ov::AnyMap& properties) { /* To initialize pipeline in case when user passes "USE_BLOBS=YES", next steps are required: 1) Check that neither MAX_PROMPT_LEN nor MIN_RESPONSE_LEN is @@ -446,15 +446,15 @@ void StaticLLMPipeline::setupAndImportModels( ov::Core core; auto import_blob = [this, - &path, - &pipeline_config, + &models_path, + &properties, &core, &device](const std::string& model_name, ov::AnyMap& model_config) { auto blob_path = pop_or_default(model_config, "BLOB_PATH", std::string{}); if (blob_path.empty()) { - blob_path = (path / + blob_path = (models_path / (std::string("openvino_") + model_name + ".blob")).string(); } @@ -463,7 +463,7 @@ void StaticLLMPipeline::setupAndImportModels( + blob_path); } - merge_config_with(model_config, pipeline_config); + merge_config_with(model_config, properties); std::fstream fs(blob_path, std::ios::in | std::ios::binary); @@ -484,17 +484,17 @@ void StaticLLMPipeline::setupAndImportModels( // (1) Check that neither MAX_PROMPT_LEN nor MIN_RESPONSE_LEN is // exposed in the config - if (pipeline_config.count("MAX_PROMPT_LEN") || - pipeline_config.count("MIN_RESPONSE_LEN")) { + if (properties.count("MAX_PROMPT_LEN") || + properties.count("MIN_RESPONSE_LEN")) { OPENVINO_THROW("Neither \"MAX_PROMPT_LEN\" nor \"MIN_RESPONSE_LEN\"" " can be specified in \"USE_BLOBS=YES\" configuration!"); } // (2) Import prefill model from model directory or specified path - auto prefill_config = pop_or_default(pipeline_config, "PREFILL_CONFIG", ov::AnyMap()); + auto prefill_config = pop_or_default(properties, "PREFILL_CONFIG", ov::AnyMap()); auto prefill_model = import_blob("prefill", prefill_config); m_prefill_request = prefill_model.create_infer_request(); // (3) Import generate model from model directory or specified path - auto generate_config = pop_or_default(pipeline_config, "GENERATE_CONFIG", ov::AnyMap()); + auto generate_config = pop_or_default(properties, "GENERATE_CONFIG", ov::AnyMap()); auto generate_model = import_blob("generate", generate_config); m_kvcache_request = generate_model.create_infer_request(); // (4) Fill in m_kvcache_desc diff --git 
a/src/cpp/src/lora_adapter.cpp b/src/cpp/src/lora_adapter.cpp index 5167db73f4..18d0132138 100644 --- a/src/cpp/src/lora_adapter.cpp +++ b/src/cpp/src/lora_adapter.cpp @@ -86,7 +86,7 @@ using LoRATensors = std::map; // Read binary file to memory. -BufferPtr read_file_helper(const std::string& filename) { +BufferPtr read_file_helper(const std::filesystem::path& filename) { std::ifstream file(filename, std::ios::binary | std::ios::ate); OPENVINO_ASSERT(file.is_open(), "Cannot open file with LoRA weights: ", filename); @@ -136,7 +136,7 @@ struct AutoSafetensor: public safetensors_File { // The key in the map is a tensor name and the Constant uses a region of memory from the memory block. // Each Constant holds a shared pointer to the block in the runtime info. // The memory block will be deallocated when the last Constant is destroyed. -ConstantMap read_safetensors(const std::string& filename) { +ConstantMap read_safetensors(const std::filesystem::path& filename) { auto buffer = read_file_helper(filename); AutoSafetensor safe_tensors_file{}; @@ -771,7 +771,7 @@ namespace genai { class Adapter::Impl { public: - Impl(const std::string& path) : + Impl(const std::filesystem::path& path) : tensors(group_lora_tensors(read_safetensors(path), default_lora_patterns())) { std::set keys; @@ -799,7 +799,7 @@ class Adapter::Impl { }; -Adapter::Adapter(const std::string& path) : +Adapter::Adapter(const std::filesystem::path& path) : m_pimpl(std::make_shared(path)) { } diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index 708b921fd5..c373b2953f 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -16,21 +16,21 @@ bool operator==(const SchedulerConfig& lhs, const SchedulerConfig& rhs) { } ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl( - const std::string& main_models_path, + const std::filesystem::path& main_models_path, const SchedulerConfig& main_scheduler_config, const std::string& main_device, - const ov::AnyMap& main_plugin_config, + const ov::AnyMap& main_properties, const ov::genai::ModelDesc draft_model_desc, - const ov::AnyMap& tokenizer_plugin_config) { + const ov::AnyMap& tokenizer_properties) { ov::Core core; - auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(main_plugin_config); - core.set_property(core_plugin_config); + auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(main_properties); + core.set_property(core_properties); - std::string openvino_model_name = "/openvino_model.xml", - draft_model_path = draft_model_desc.model_path; + std::filesystem::path openvino_model_name = "openvino_model.xml", + draft_models_path = draft_model_desc.models_path; - std::shared_ptr main_model = core.read_model(main_models_path + openvino_model_name), - draft_model = core.read_model(draft_model_path + openvino_model_name); + std::shared_ptr main_model = core.read_model((main_models_path / openvino_model_name).string()), + draft_model = core.read_model((draft_models_path / openvino_model_name).string()); utils::apply_paged_attention_transformations(main_model, main_scheduler_config.use_cache_eviction); utils::apply_paged_attention_transformations(draft_model, main_scheduler_config.use_cache_eviction); @@ -58,24 +58,24 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl( 
draft_scheduler_config.cache_size = draft_cache_size; } - ov::AnyMap draft_plugin_config = draft_model_desc.plugin_config == ov::AnyMap{} ? compile_plugin_config : draft_model_desc.plugin_config; + ov::AnyMap draft_properties = draft_model_desc.properties == ov::AnyMap{} ? compile_properties : draft_model_desc.properties; - DeviceConfig main_device_config(core, main_scheduler_config, main_device, compile_plugin_config), - draft_device_config(core, draft_scheduler_config, draft_device, draft_plugin_config); + DeviceConfig main_device_config(core, main_scheduler_config, main_device, compile_properties), + draft_device_config(core, draft_scheduler_config, draft_device, draft_properties); utils::set_kv_cache_type_and_shape(main_model, main_device_config); utils::set_kv_cache_type_and_shape(draft_model, draft_device_config); // main and draft model can have different tokenizers // to do: support retokenization: 154103 - Tokenizer main_model_tokenizer(main_models_path, tokenizer_plugin_config), - draft_model_tokenizer(draft_model_path, tokenizer_plugin_config); + Tokenizer main_model_tokenizer(main_models_path, tokenizer_properties), + draft_model_tokenizer(draft_models_path, tokenizer_properties); m_tokenizer = main_model_tokenizer; // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode - m_main_pipeline = std::make_shared(core, main_model, main_model_tokenizer, main_device_config, main_scheduler_config, main_device, compile_plugin_config, true); - m_draft_pipeline = std::make_shared(core, draft_model, draft_model_tokenizer, draft_device_config, draft_scheduler_config, draft_device, draft_plugin_config, false); + m_main_pipeline = std::make_shared(core, main_model, main_model_tokenizer, main_device_config, main_scheduler_config, main_device, compile_properties, true); + m_draft_pipeline = std::make_shared(core, draft_model, draft_model_tokenizer, draft_device_config, draft_scheduler_config, draft_device, draft_properties, false); } GenerationHandle diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp index 3b6e43544d..b427e311b4 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp @@ -11,18 +11,18 @@ namespace ov::genai { struct ModelDesc { - std::string model_path; + std::filesystem::path models_path; std::string device; ov::genai::SchedulerConfig scheduler_config; - ov::AnyMap plugin_config; + ov::AnyMap properties; - ModelDesc(const std::string& model_path, - const std::string& device = "", - const ov::AnyMap& plugin_config = {}, + ModelDesc(const std::filesystem::path& models_path, + const std::string& device = {}, + const ov::AnyMap& properties = {}, const ov::genai::SchedulerConfig& scheduler_config = {}) : - model_path(model_path), + models_path(models_path), device(device), - plugin_config(plugin_config), + properties(properties), scheduler_config(scheduler_config) {} }; @@ -32,12 +32,12 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat SpeculativeDecodingMetrics m_sd_metrics; public: - SpeculativeDecodingImpl(const std::string& main_models_path, + SpeculativeDecodingImpl(const std::filesystem::path& main_models_path, const SchedulerConfig& scheduler_config, const std::string& device, - const ov::AnyMap& plugin_config, + const ov::AnyMap& properties, const ov::genai::ModelDesc draft_model_desc, - const ov::AnyMap& 
tokenizer_config = {}); + const ov::AnyMap& tokenizer_properties = {}); GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, diff --git a/src/cpp/src/text2image/diffusion_pipeline.hpp b/src/cpp/src/text2image/diffusion_pipeline.hpp index a63690d7eb..3c53a5b66b 100644 --- a/src/cpp/src/text2image/diffusion_pipeline.hpp +++ b/src/cpp/src/text2image/diffusion_pipeline.hpp @@ -25,8 +25,8 @@ void batch_copy(ov::Tensor src, ov::Tensor dst, size_t src_batch, size_t dst_bat ov::Tensor(src, src_start, src_end).copy_to(ov::Tensor(dst, dst_start, dst_end)); } -const std::string get_class_name(const std::string& root_dir) { - const std::string model_index_path = root_dir + "/model_index.json"; +const std::string get_class_name(const std::filesystem::path& root_dir) { + const std::filesystem::path model_index_path = root_dir / "model_index.json"; std::ifstream file(model_index_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); diff --git a/src/cpp/src/text2image/models/autoencoder_kl.cpp b/src/cpp/src/text2image/models/autoencoder_kl.cpp index 080bbd7931..f78d9df182 100644 --- a/src/cpp/src/text2image/models/autoencoder_kl.cpp +++ b/src/cpp/src/text2image/models/autoencoder_kl.cpp @@ -21,7 +21,7 @@ namespace ov { namespace genai { -AutoencoderKL::Config::Config(const std::string& config_path) { +AutoencoderKL::Config::Config(const std::filesystem::path& config_path) { std::ifstream file(config_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path); @@ -35,19 +35,19 @@ AutoencoderKL::Config::Config(const std::string& config_path) { read_json_param(data, "block_out_channels", block_out_channels); } -AutoencoderKL::AutoencoderKL(const std::string& root_dir) - : m_config(root_dir + "/config.json") { +AutoencoderKL::AutoencoderKL(const std::filesystem::path& root_dir) + : m_config(root_dir / "config.json") { ov::Core core = utils::singleton_core(); - m_model = core.read_model(root_dir + "/openvino_model.xml"); + m_model = core.read_model((root_dir / "openvino_model.xml").string()); // apply VaeImageProcessor postprocessing steps by merging them into the VAE decoder model merge_vae_image_processor(); } -AutoencoderKL::AutoencoderKL(const std::string& root_dir, - const std::string& device, - const ov::AnyMap& properties) +AutoencoderKL::AutoencoderKL(const std::filesystem::path& root_dir, + const std::string& device, + const ov::AnyMap& properties) : AutoencoderKL(root_dir) { - if(auto filtered_properties = extract_adapters_from_properties(properties)) { + if (auto filtered_properties = extract_adapters_from_properties(properties)) { compile(device, *filtered_properties); } else { compile(device, properties); diff --git a/src/cpp/src/text2image/models/clip_text_model.cpp b/src/cpp/src/text2image/models/clip_text_model.cpp index 169bd1eb53..f82ed7a4aa 100644 --- a/src/cpp/src/text2image/models/clip_text_model.cpp +++ b/src/cpp/src/text2image/models/clip_text_model.cpp @@ -12,7 +12,7 @@ namespace ov { namespace genai { -CLIPTextModel::Config::Config(const std::string& config_path) { +CLIPTextModel::Config::Config(const std::filesystem::path& config_path) { std::ifstream file(config_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path); @@ -24,19 +24,19 @@ CLIPTextModel::Config::Config(const std::string& config_path) { read_json_param(data, "num_hidden_layers", num_hidden_layers); } -CLIPTextModel::CLIPTextModel(const std::string root_dir) : - m_clip_tokenizer(root_dir + "/../tokenizer"), - m_config(root_dir + "/config.json") { 
+CLIPTextModel::CLIPTextModel(const std::filesystem::path& root_dir) : + m_clip_tokenizer(root_dir.parent_path() / "tokenizer"), + m_config(root_dir / "config.json") { ov::Core core = utils::singleton_core(); - m_model = core.read_model(root_dir + "/openvino_model.xml"); + m_model = core.read_model((root_dir / "openvino_model.xml").string()); } -CLIPTextModel::CLIPTextModel(const std::string& root_dir, - const std::string& device, - const ov::AnyMap& properties) : +CLIPTextModel::CLIPTextModel(const std::filesystem::path& root_dir, + const std::string& device, + const ov::AnyMap& properties) : CLIPTextModel(root_dir) { AdapterConfig adapters; - if(auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) { + if (auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) { adapters.set_tensor_name_prefix(adapters.get_tensor_name_prefix().value_or("lora_te")); m_adapter_controller = AdapterController(m_model, adapters, device); compile(device, *filtered_properties); diff --git a/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp b/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp index 8d374ad92c..878d660b62 100644 --- a/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp +++ b/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp @@ -12,7 +12,7 @@ namespace ov { namespace genai { -CLIPTextModelWithProjection::Config::Config(const std::string& config_path) { +CLIPTextModelWithProjection::Config::Config(const std::filesystem::path& config_path) { std::ifstream file(config_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path); @@ -24,14 +24,14 @@ CLIPTextModelWithProjection::Config::Config(const std::string& config_path) { read_json_param(data, "num_hidden_layers", num_hidden_layers); } -CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string root_dir) : - m_clip_tokenizer(root_dir + "/../tokenizer_2"), - m_config(root_dir + "/config.json") { +CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir) : + m_clip_tokenizer(root_dir.parent_path() / "tokenizer_2"), + m_config(root_dir / "config.json") { ov::Core core = utils::singleton_core(); - m_model = core.read_model(root_dir + "/openvino_model.xml"); + m_model = core.read_model((root_dir / "openvino_model.xml").string()); } -CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& root_dir, +CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties) : CLIPTextModelWithProjection(root_dir) { diff --git a/src/cpp/src/text2image/models/unet2d_condition_model.cpp b/src/cpp/src/text2image/models/unet2d_condition_model.cpp index 9efda91a2c..182d2f7012 100644 --- a/src/cpp/src/text2image/models/unet2d_condition_model.cpp +++ b/src/cpp/src/text2image/models/unet2d_condition_model.cpp @@ -12,7 +12,7 @@ namespace ov { namespace genai { -UNet2DConditionModel::Config::Config(const std::string& config_path) { +UNet2DConditionModel::Config::Config(const std::filesystem::path& config_path) { std::ifstream file(config_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path); @@ -25,20 +25,20 @@ UNet2DConditionModel::Config::Config(const std::string& config_path) { read_json_param(data, "time_cond_proj_dim", time_cond_proj_dim); } -UNet2DConditionModel::UNet2DConditionModel(const std::string root_dir) : - m_config(root_dir + "/config.json") { 
+UNet2DConditionModel::UNet2DConditionModel(const std::filesystem::path& root_dir) : + m_config(root_dir / "config.json") { ov::Core core = utils::singleton_core(); - m_model = core.read_model(root_dir + "/openvino_model.xml"); + m_model = core.read_model((root_dir / "openvino_model.xml").string()); // compute VAE scale factor m_vae_scale_factor = std::pow(2, m_config.block_out_channels.size() - 1); } -UNet2DConditionModel::UNet2DConditionModel(const std::string& root_dir, - const std::string& device, - const ov::AnyMap& properties) : +UNet2DConditionModel::UNet2DConditionModel(const std::filesystem::path& root_dir, + const std::string& device, + const ov::AnyMap& properties) : UNet2DConditionModel(root_dir) { AdapterConfig adapters; - if(auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) { + if (auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) { adapters.set_tensor_name_prefix(adapters.get_tensor_name_prefix().value_or("lora_unet")); m_adapter_controller = AdapterController(m_model, adapters, device); compile(device, *filtered_properties); diff --git a/src/cpp/src/text2image/schedulers/ddim.cpp b/src/cpp/src/text2image/schedulers/ddim.cpp index fd1a4fb2c0..eaeb210cd1 100644 --- a/src/cpp/src/text2image/schedulers/ddim.cpp +++ b/src/cpp/src/text2image/schedulers/ddim.cpp @@ -12,7 +12,7 @@ namespace ov { namespace genai { -DDIMScheduler::Config::Config(const std::string& scheduler_config_path) { +DDIMScheduler::Config::Config(const std::filesystem::path& scheduler_config_path) { std::ifstream file(scheduler_config_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); @@ -36,7 +36,7 @@ DDIMScheduler::Config::Config(const std::string& scheduler_config_path) { read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr); } -DDIMScheduler::DDIMScheduler(const std::string scheduler_config_path) +DDIMScheduler::DDIMScheduler(const std::filesystem::path& scheduler_config_path) : DDIMScheduler(Config(scheduler_config_path)) { } diff --git a/src/cpp/src/text2image/schedulers/ddim.hpp b/src/cpp/src/text2image/schedulers/ddim.hpp index 936f4991ea..d0ab53d0f5 100644 --- a/src/cpp/src/text2image/schedulers/ddim.hpp +++ b/src/cpp/src/text2image/schedulers/ddim.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include @@ -28,10 +29,10 @@ class DDIMScheduler : public IScheduler { bool rescale_betas_zero_snr = false; Config() = default; - explicit Config(const std::string& scheduler_config_path); + explicit Config(const std::filesystem::path& scheduler_config_path); }; - explicit DDIMScheduler(const std::string scheduler_config_path); + explicit DDIMScheduler(const std::filesystem::path& scheduler_config_path); explicit DDIMScheduler(const Config& scheduler_config); void set_timesteps(size_t num_inference_steps) override; diff --git a/src/cpp/src/text2image/schedulers/euler_discrete.cpp b/src/cpp/src/text2image/schedulers/euler_discrete.cpp index c7d8c5e532..6ac65177d8 100644 --- a/src/cpp/src/text2image/schedulers/euler_discrete.cpp +++ b/src/cpp/src/text2image/schedulers/euler_discrete.cpp @@ -14,7 +14,7 @@ namespace ov { namespace genai { -EulerDiscreteScheduler::Config::Config(const std::string& scheduler_config_path) { +EulerDiscreteScheduler::Config::Config(const std::filesystem::path& scheduler_config_path) { std::ifstream file(scheduler_config_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); @@ -40,7 +40,7 @@ EulerDiscreteScheduler::Config::Config(const 
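A sketch of constructing the individual text-to-image submodels with the new std::filesystem::path overloads shown above; the export directory is a placeholder and the empty property map is passed explicitly because default arguments are not visible here.

#include <filesystem>

#include "openvino/genai/text2image/autoencoder_kl.hpp"
#include "openvino/genai/text2image/clip_text_model.hpp"
#include "openvino/genai/text2image/unet2d_condition_model.hpp"

int main() {
    // Placeholder export directory laid out like the paths read above.
    std::filesystem::path root_dir = "stable_diffusion_ov";

    ov::genai::CLIPTextModel text_encoder(root_dir / "text_encoder", "CPU", ov::AnyMap{});
    ov::genai::UNet2DConditionModel unet(root_dir / "unet", "CPU", ov::AnyMap{});
    ov::genai::AutoencoderKL vae_decoder(root_dir / "vae_decoder", "CPU", ov::AnyMap{});
    return 0;
}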
std::string& scheduler_config_path) read_json_param(data, "use_beta_sigmas", use_beta_sigmas); } -EulerDiscreteScheduler::EulerDiscreteScheduler(const std::string scheduler_config_path) +EulerDiscreteScheduler::EulerDiscreteScheduler(const std::filesystem::path& scheduler_config_path) : EulerDiscreteScheduler(Config(scheduler_config_path)) {} EulerDiscreteScheduler::EulerDiscreteScheduler(const Config& scheduler_config) : m_config(scheduler_config) { diff --git a/src/cpp/src/text2image/schedulers/euler_discrete.hpp b/src/cpp/src/text2image/schedulers/euler_discrete.hpp index 96c3fbfbdf..e6c826f739 100644 --- a/src/cpp/src/text2image/schedulers/euler_discrete.hpp +++ b/src/cpp/src/text2image/schedulers/euler_discrete.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include @@ -30,10 +31,10 @@ class EulerDiscreteScheduler : public IScheduler { bool use_karras_sigmas = false, use_exponential_sigmas = false, use_beta_sigmas = false; Config() = default; - explicit Config(const std::string& scheduler_config_path); + explicit Config(const std::filesystem::path& scheduler_config_path); }; - explicit EulerDiscreteScheduler(const std::string scheduler_config_path); + explicit EulerDiscreteScheduler(const std::filesystem::path& scheduler_config_path); explicit EulerDiscreteScheduler(const Config& scheduler_config); void set_timesteps(size_t num_inference_steps) override; diff --git a/src/cpp/src/text2image/schedulers/lcm.cpp b/src/cpp/src/text2image/schedulers/lcm.cpp index c9947297de..c4f0f072a1 100644 --- a/src/cpp/src/text2image/schedulers/lcm.cpp +++ b/src/cpp/src/text2image/schedulers/lcm.cpp @@ -14,7 +14,7 @@ namespace ov { namespace genai { -LCMScheduler::Config::Config(const std::string scheduler_config_path) { +LCMScheduler::Config::Config(const std::filesystem::path& scheduler_config_path) { std::ifstream file(scheduler_config_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); @@ -40,7 +40,7 @@ LCMScheduler::Config::Config(const std::string scheduler_config_path) { read_json_param(data, "timestep_spacing", timestep_spacing); } -LCMScheduler::LCMScheduler(const std::string scheduler_config_path) : +LCMScheduler::LCMScheduler(const std::filesystem::path& scheduler_config_path) : LCMScheduler(Config(scheduler_config_path)) { } diff --git a/src/cpp/src/text2image/schedulers/lcm.hpp b/src/cpp/src/text2image/schedulers/lcm.hpp index 8abbcd3e29..13b9d9406c 100644 --- a/src/cpp/src/text2image/schedulers/lcm.hpp +++ b/src/cpp/src/text2image/schedulers/lcm.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include #include @@ -35,10 +36,10 @@ class LCMScheduler : public IScheduler { bool rescale_betas_zero_snr = false; Config() = default; - explicit Config(const std::string scheduler_config_path); + explicit Config(const std::filesystem::path& scheduler_config_path); }; - explicit LCMScheduler(const std::string scheduler_config_path); + explicit LCMScheduler(const std::filesystem::path& scheduler_config_path); explicit LCMScheduler(const Config& scheduler_config); void set_timesteps(size_t num_inference_steps) override; diff --git a/src/cpp/src/text2image/schedulers/lms_discrete.cpp b/src/cpp/src/text2image/schedulers/lms_discrete.cpp index 270d12ecea..dbb1358373 100644 --- a/src/cpp/src/text2image/schedulers/lms_discrete.cpp +++ b/src/cpp/src/text2image/schedulers/lms_discrete.cpp @@ -92,7 +92,7 @@ int64_t LMSDiscreteScheduler::_sigma_to_t(float sigma) const { return timestep; } -LMSDiscreteScheduler::Config::Config(const std::string& scheduler_config_path) { 
+LMSDiscreteScheduler::Config::Config(const std::filesystem::path& scheduler_config_path) { std::ifstream file(scheduler_config_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); @@ -108,7 +108,7 @@ LMSDiscreteScheduler::Config::Config(const std::string& scheduler_config_path) { read_json_param(data, "steps_offset", steps_offset); } -LMSDiscreteScheduler::LMSDiscreteScheduler(const std::string scheduler_config_path) +LMSDiscreteScheduler::LMSDiscreteScheduler(const std::filesystem::path& scheduler_config_path) : LMSDiscreteScheduler(Config(scheduler_config_path)) { } diff --git a/src/cpp/src/text2image/schedulers/lms_discrete.hpp b/src/cpp/src/text2image/schedulers/lms_discrete.hpp index a8eacc4759..6c0a61a777 100644 --- a/src/cpp/src/text2image/schedulers/lms_discrete.hpp +++ b/src/cpp/src/text2image/schedulers/lms_discrete.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include @@ -24,10 +25,10 @@ class LMSDiscreteScheduler : public IScheduler { size_t steps_offset = 0; Config() = default; - explicit Config(const std::string& scheduler_config_path); + explicit Config(const std::filesystem::path& scheduler_config_path); }; - explicit LMSDiscreteScheduler(const std::string scheduler_config_path); + explicit LMSDiscreteScheduler(const std::filesystem::path& scheduler_config_path); explicit LMSDiscreteScheduler(const Config& scheduler_config); void set_timesteps(size_t num_inference_steps) override; diff --git a/src/cpp/src/text2image/schedulers/scheduler.cpp b/src/cpp/src/text2image/schedulers/scheduler.cpp index a69de78868..2ee4c2adac 100644 --- a/src/cpp/src/text2image/schedulers/scheduler.cpp +++ b/src/cpp/src/text2image/schedulers/scheduler.cpp @@ -15,7 +15,7 @@ namespace ov { namespace genai { -std::shared_ptr Text2ImagePipeline::Scheduler::from_config(const std::string& scheduler_config_path, Type scheduler_type) { +std::shared_ptr Text2ImagePipeline::Scheduler::from_config(const std::filesystem::path& scheduler_config_path, Type scheduler_type) { std::ifstream file(scheduler_config_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); diff --git a/src/cpp/src/text2image/stable_diffusion_pipeline.hpp b/src/cpp/src/text2image/stable_diffusion_pipeline.hpp index 5586e28c56..014f1763bf 100644 --- a/src/cpp/src/text2image/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/text2image/stable_diffusion_pipeline.hpp @@ -5,6 +5,7 @@ #include #include +#include #include "json_utils.hpp" #include "lora_helper.hpp" @@ -39,33 +40,33 @@ ov::Tensor get_guidance_scale_embedding(float guidance_scale, uint32_t embedding class Text2ImagePipeline::StableDiffusionPipeline : public Text2ImagePipeline::DiffusionPipeline { public: - explicit StableDiffusionPipeline(const std::string& root_dir) { - const std::string model_index_path = root_dir + "/model_index.json"; + explicit StableDiffusionPipeline(const std::filesystem::path& root_dir) { + const std::filesystem::path model_index_path = root_dir / "model_index.json"; std::ifstream file(model_index_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); nlohmann::json data = nlohmann::json::parse(file); using utils::read_json_param; - set_scheduler(Scheduler::from_config(root_dir + "/scheduler/scheduler_config.json")); + set_scheduler(Scheduler::from_config(root_dir / "scheduler/scheduler_config.json")); const std::string text_encoder = data["text_encoder"][1].get(); if (text_encoder == "CLIPTextModel") { - m_clip_text_encoder = std::make_shared(root_dir + "/text_encoder"); + 
m_clip_text_encoder = std::make_shared(root_dir / "text_encoder"); } else { OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); } const std::string unet = data["unet"][1].get(); if (unet == "UNet2DConditionModel") { - m_unet = std::make_shared(root_dir + "/unet"); + m_unet = std::make_shared(root_dir / "unet"); } else { OPENVINO_THROW("Unsupported '", unet, "' UNet type"); } const std::string vae = data["vae"][1].get(); if (vae == "AutoencoderKL") { - m_vae_decoder = std::make_shared(root_dir + "/vae_decoder"); + m_vae_decoder = std::make_shared(root_dir / "vae_decoder"); } else { OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); } @@ -74,33 +75,33 @@ class Text2ImagePipeline::StableDiffusionPipeline : public Text2ImagePipeline::D initialize_generation_config(data["_class_name"].get()); } - StableDiffusionPipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) { - const std::string model_index_path = root_dir + "/model_index.json"; + StableDiffusionPipeline(const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties) { + const std::filesystem::path model_index_path = root_dir / "model_index.json"; std::ifstream file(model_index_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); nlohmann::json data = nlohmann::json::parse(file); using utils::read_json_param; - set_scheduler(Scheduler::from_config(root_dir + "/scheduler/scheduler_config.json")); + set_scheduler(Scheduler::from_config(root_dir / "scheduler/scheduler_config.json")); const std::string text_encoder = data["text_encoder"][1].get(); if (text_encoder == "CLIPTextModel") { - m_clip_text_encoder = std::make_shared(root_dir + "/text_encoder", device, properties); + m_clip_text_encoder = std::make_shared(root_dir / "text_encoder", device, properties); } else { OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); } const std::string unet = data["unet"][1].get(); if (unet == "UNet2DConditionModel") { - m_unet = std::make_shared(root_dir + "/unet", device, properties); + m_unet = std::make_shared(root_dir / "unet", device, properties); } else { OPENVINO_THROW("Unsupported '", unet, "' UNet type"); } const std::string vae = data["vae"][1].get(); if (vae == "AutoencoderKL") { - m_vae_decoder = std::make_shared(root_dir + "/vae_decoder", device, properties); + m_vae_decoder = std::make_shared(root_dir / "vae_decoder", device, properties); } else { OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); } diff --git a/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp index 483d3628ac..e42a34d287 100644 --- a/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp +++ b/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp @@ -13,40 +13,40 @@ namespace genai { class Text2ImagePipeline::StableDiffusionXLPipeline : public Text2ImagePipeline::DiffusionPipeline { public: - explicit StableDiffusionXLPipeline(const std::string& root_dir) { - const std::string model_index_path = root_dir + "/model_index.json"; + explicit StableDiffusionXLPipeline(const std::filesystem::path& root_dir) { + const std::filesystem::path model_index_path = root_dir / "model_index.json"; std::ifstream file(model_index_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); nlohmann::json data = nlohmann::json::parse(file); using utils::read_json_param; - set_scheduler(Scheduler::from_config(root_dir + "/scheduler/scheduler_config.json")); + 
set_scheduler(Scheduler::from_config(root_dir / "scheduler/scheduler_config.json")); const std::string text_encoder = data["text_encoder"][1].get(); if (text_encoder == "CLIPTextModel") { - m_clip_text_encoder = std::make_shared(root_dir + "/text_encoder"); + m_clip_text_encoder = std::make_shared(root_dir / "text_encoder"); } else { OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); } const std::string text_encoder_2 = data["text_encoder_2"][1].get(); if (text_encoder_2 == "CLIPTextModelWithProjection") { - m_clip_text_encoder_with_projection = std::make_shared(root_dir + "/text_encoder_2"); + m_clip_text_encoder_with_projection = std::make_shared(root_dir / "text_encoder_2"); } else { OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); } const std::string unet = data["unet"][1].get(); if (unet == "UNet2DConditionModel") { - m_unet = std::make_shared(root_dir + "/unet"); + m_unet = std::make_shared(root_dir / "unet"); } else { OPENVINO_THROW("Unsupported '", unet, "' UNet type"); } const std::string vae = data["vae"][1].get(); if (vae == "AutoencoderKL") { - m_vae_decoder = std::make_shared(root_dir + "/vae_decoder"); + m_vae_decoder = std::make_shared(root_dir / "vae_decoder"); } else { OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); } @@ -55,20 +55,20 @@ class Text2ImagePipeline::StableDiffusionXLPipeline : public Text2ImagePipeline: initialize_generation_config(data["_class_name"].get()); } - StableDiffusionXLPipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) { - const std::string model_index_path = root_dir + "/model_index.json"; + StableDiffusionXLPipeline(const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties) { + const std::filesystem::path model_index_path = root_dir / "model_index.json"; std::ifstream file(model_index_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); nlohmann::json data = nlohmann::json::parse(file); using utils::read_json_param; - set_scheduler(Scheduler::from_config(root_dir + "/scheduler/scheduler_config.json")); + set_scheduler(Scheduler::from_config(root_dir / "scheduler/scheduler_config.json")); const std::string text_encoder = data["text_encoder"][1].get(); if (text_encoder == "CLIPTextModel") { AdapterConfig adapters; - std::string path = root_dir + "/text_encoder"; + std::filesystem::path path = root_dir / "text_encoder"; if(update_adapters_from_properties(properties, adapters) && !adapters.get_tensor_name_prefix()) { auto clip_properties = properties; adapters.set_tensor_name_prefix("lora_te1"); @@ -84,7 +84,7 @@ class Text2ImagePipeline::StableDiffusionXLPipeline : public Text2ImagePipeline: const std::string text_encoder_2 = data["text_encoder_2"][1].get(); if (text_encoder_2 == "CLIPTextModelWithProjection") { AdapterConfig adapters; - std::string path = root_dir + "/text_encoder_2"; + std::filesystem::path path = root_dir / "text_encoder_2"; if(update_adapters_from_properties(properties, adapters) && !adapters.get_tensor_name_prefix()) { auto clip_properties = properties; adapters.set_tensor_name_prefix("lora_te2"); @@ -99,14 +99,14 @@ class Text2ImagePipeline::StableDiffusionXLPipeline : public Text2ImagePipeline: const std::string unet = data["unet"][1].get(); if (unet == "UNet2DConditionModel") { - m_unet = std::make_shared(root_dir + "/unet", device, properties); + m_unet = std::make_shared(root_dir / "unet", device, properties); } else { OPENVINO_THROW("Unsupported '", unet, "' UNet 
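A construction sketch for the directory-based diffusion pipelines in this section; the layout comments summarize the files these constructors read, the directory name is a placeholder, and the explicit empty property map is an assumption about the constructor defaults.

#include <filesystem>

#include "openvino/genai/text2image/pipeline.hpp"

int main() {
    // Placeholder directory; the layout mirrors what this section reads:
    //   model_index.json
    //   scheduler/scheduler_config.json
    //   text_encoder/   text_encoder_2/ (SDXL only)
    //   tokenizer/      tokenizer_2/    (SDXL only)
    //   unet/           vae_decoder/
    std::filesystem::path root_dir = "stable_diffusion_xl_ov";

    ov::genai::Text2ImagePipeline pipe(root_dir, "CPU", ov::AnyMap{});
    return 0;
}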
type"); } const std::string vae = data["vae"][1].get(); if (vae == "AutoencoderKL") { - m_vae_decoder = std::make_shared(root_dir + "/vae_decoder", device, properties); + m_vae_decoder = std::make_shared(root_dir / "vae_decoder", device, properties); } else { OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); } diff --git a/src/cpp/src/text2image/text2image_pipeline.cpp b/src/cpp/src/text2image/text2image_pipeline.cpp index 04422ef12f..a50ca564af 100644 --- a/src/cpp/src/text2image/text2image_pipeline.cpp +++ b/src/cpp/src/text2image/text2image_pipeline.cpp @@ -63,7 +63,7 @@ void Text2ImagePipeline::GenerationConfig::validate() const { // Text2ImagePipeline // -Text2ImagePipeline::Text2ImagePipeline(const std::string& root_dir) { +Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir) { const std::string class_name = get_class_name(root_dir); if (class_name == "StableDiffusionPipeline" || @@ -76,7 +76,7 @@ Text2ImagePipeline::Text2ImagePipeline(const std::string& root_dir) { } } -Text2ImagePipeline::Text2ImagePipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) { +Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties) { const std::string class_name = get_class_name(root_dir); if (class_name == "StableDiffusionPipeline" || diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 757a5c3544..c5ccd8ab4d 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -79,11 +79,11 @@ class Tokenizer::TokenizerImpl { int64_t m_bos_token_id = -1; int64_t m_eos_token_id = -1; - std::string m_pad_token = ""; - std::string m_bos_token = ""; - std::string m_eos_token = ""; + std::string m_pad_token = {}; + std::string m_bos_token = {}; + std::string m_eos_token = {}; - std::string m_chat_template = ""; + std::string m_chat_template = {}; void set_state_if_necessary(CircularBufferQueueElementGuard& infer_request_guard, bool add_special_tokens) { // If user requested add_special_tokens mode different from the current one, @@ -110,15 +110,15 @@ class Tokenizer::TokenizerImpl { TokenizerImpl() = default; - TokenizerImpl(std::filesystem::path tokenizer_path, const ov::AnyMap& plugin_config) + TokenizerImpl(std::filesystem::path tokenizer_path, const ov::AnyMap& properties) : m_chat_template{chat_template_from_tokenizer_json_if_exists(tokenizer_path)} { ov::Core core; - OPENVINO_ASSERT(tokenizer_path.extension() != ".xml", "ov_tokenizers_path should be a path to a dir not a xml file"); + OPENVINO_ASSERT(tokenizer_path.extension() != ".xml", "ov_tokenizer_path should be a path to a dir not a xml file"); - const char* ov_tokenizers_path = getenv(ScopedVar::ENVIRONMENT_VARIABLE_NAME); - OPENVINO_ASSERT(ov_tokenizers_path, "openvino_tokenizers path is not set"); - core.add_extension(ov_tokenizers_path); + const char* ov_tokenizer_path = getenv(ScopedVar::ENVIRONMENT_VARIABLE_NAME); + OPENVINO_ASSERT(ov_tokenizer_path, "openvino_tokenizers path is not set"); + core.add_extension(ov_tokenizer_path); read_config(tokenizer_path); read_special_tokens_map(tokenizer_path); @@ -133,9 +133,9 @@ class Tokenizer::TokenizerImpl { manager.register_pass(); manager.run_passes(ov_tokenizer); - m_tokenizer = core.compile_model(ov_tokenizer, device, plugin_config); + m_tokenizer = core.compile_model(ov_tokenizer, device, properties); if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) { - m_detokenizer = 
core.compile_model(tokenizer_path / "openvino_detokenizer.xml", device, plugin_config); + m_detokenizer = core.compile_model(tokenizer_path / "openvino_detokenizer.xml", device, properties); } @@ -306,7 +306,6 @@ class Tokenizer::TokenizerImpl { } TokenizedInputs encode(std::vector& prompts, const ov::AnyMap& tokenization_params = {}) { - TokenizedInputs unpadded; { bool add_special_tokens_flag = true; @@ -491,9 +490,9 @@ class Tokenizer::TokenizerImpl { } }; -Tokenizer::Tokenizer(const std::string& tokenizer_path, const ov::AnyMap& plugin_config) { +Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties) { ScopedVar env_manager(tokenizers_relative_to_genai().string()); - m_pimpl = std::make_shared(tokenizer_path, plugin_config); + m_pimpl = std::make_shared(tokenizer_path, properties); } TokenizedInputs Tokenizer::encode(const std::string prompt, const ov::AnyMap& tokenization_params) { diff --git a/src/cpp/src/tokenizers_path.hpp b/src/cpp/src/tokenizers_path.hpp index 5557b099f1..a8ef1cb214 100644 --- a/src/cpp/src/tokenizers_path.hpp +++ b/src/cpp/src/tokenizers_path.hpp @@ -22,9 +22,10 @@ namespace { // and unsets in destructor. Does nothing if ENVIRONMENT_VARIABLE_NAME // was already defined. class ScopedVar { -public: bool was_already_set{false}; +public: static constexpr char ENVIRONMENT_VARIABLE_NAME[] = "OPENVINO_TOKENIZERS_PATH_GENAI"; + explicit ScopedVar(const std::string& environment_variable_value) { #ifdef _WIN32 char* value = nullptr; diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index ab5853403f..dcc73f2ea3 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -203,20 +203,20 @@ ProcessorConfig from_any_map( * There are not supported by `core.compile` function plugin options like `ENABLE_MMAP` * Move this options to `core.set_property` config */ -std::pair split_core_complile_config(const ov::AnyMap& plugin_config) { - const std::vector unsupported_by_compile_options{"ENABLE_MMAP"}; - ov::AnyMap core_config; - ov::AnyMap compile_config{plugin_config}; - - for (const auto option : unsupported_by_compile_options) { - auto iter = plugin_config.find(option); - if (iter != plugin_config.end()) { - core_config[option] = iter->second; - compile_config.erase(option); +std::pair split_core_complile_config(const ov::AnyMap& properties) { + const std::vector unsupported_by_compile_properties{"ENABLE_MMAP"}; + ov::AnyMap core_properties; + ov::AnyMap compile_properties{properties}; + + for (const auto option : unsupported_by_compile_properties) { + auto iter = properties.find(option); + if (iter != properties.end()) { + core_properties[option] = iter->second; + compile_properties.erase(option); } } - return {core_config, compile_config}; + return {core_properties, compile_properties}; }; ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend) { diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 531b50e163..c412b4b650 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -36,9 +36,9 @@ const std::string STREAMER_ARG_NAME = "streamer"; const std::string CONFIG_ARG_NAME = "generation_config"; const std::string DRAFT_MODEL_ARG_NAME = "draft_model"; -template -Config from_config_json_if_exists(const std::filesystem::path& model_path, const char config_name[]="generation_config.json") { - auto config_file_path = model_path / config_name; +template +Config from_config_json_if_exists(const std::filesystem::path& 
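A sketch of the path-plus-properties Tokenizer constructor above; the directory is a placeholder, and the empty property maps are passed explicitly because the defaults are not visible in this hunk.

#include <filesystem>

#include "openvino/genai/tokenizer.hpp"

int main() {
    // Placeholder; must be a directory with the exported tokenizer models,
    // not a path to an .xml file (asserted above).
    std::filesystem::path tokenizer_dir = "model_dir";

    ov::genai::Tokenizer tokenizer(tokenizer_dir, ov::AnyMap{});
    ov::genai::TokenizedInputs inputs = tokenizer.encode(std::string("Hello world"), ov::AnyMap{});
    return 0;
}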
models_path, const char config_name[] = "generation_config.json") { + auto config_file_path = models_path / config_name; if (std::filesystem::exists(config_file_path)) { return Config{(config_file_path).string()}; } else { @@ -55,7 +55,7 @@ ProcessorConfig from_any_map( const ProcessorConfig& initial ); -std::pair split_core_complile_config(const ov::AnyMap& plugin_config); +std::pair split_core_complile_config(const ov::AnyMap& properties); ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend); diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 69f135d037..ad10c0c8dc 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -169,24 +169,24 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { std::shared_ptr m_inputs_embedder; VLMPipelineImpl( - const std::filesystem::path& model_dir, + const std::filesystem::path& models_dir, const std::string& device, - const ov::AnyMap device_config + const ov::AnyMap& properties ) : m_vlm_config{ utils::from_config_json_if_exists( - model_dir, "config.json" + models_dir, "config.json" ) }, m_is_chat_conversation{false} { m_inputs_embedder = std::make_shared( - m_vlm_config, model_dir, device, device_config); + m_vlm_config, models_dir, device, properties); m_tokenizer = m_inputs_embedder->get_tokenizer(); m_embedding = m_inputs_embedder->get_embedding_model(); m_language = utils::singleton_core().compile_model( - model_dir / "openvino_language_model.xml", device, device_config + models_dir / "openvino_language_model.xml", device, properties ).create_infer_request(); m_language.get_tensor("attention_mask").set_shape({1, 0}); @@ -323,10 +323,10 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { }; VLMPipeline::VLMPipeline( - const std::filesystem::path& model_dir, + const std::filesystem::path& models_dir, const std::string& device, - const ov::AnyMap device_config -) : m_pimpl{std::make_unique(model_dir, device, device_config)} {} + const ov::AnyMap& properties +) : m_pimpl{std::make_unique(models_dir, device, properties)} {} ov::genai::VLMPipeline::~VLMPipeline() = default; diff --git a/src/cpp/src/whisper/whisper_config.cpp b/src/cpp/src/whisper/whisper_config.cpp index 9ec00c57bd..ab34063184 100644 --- a/src/cpp/src/whisper/whisper_config.cpp +++ b/src/cpp/src/whisper/whisper_config.cpp @@ -13,7 +13,7 @@ namespace ov { namespace genai { -WhisperConfig::WhisperConfig(const std::string& json_path) { +WhisperConfig::WhisperConfig(const std::filesystem::path& json_path) { // preprocessor_config.json not found. Skip parameters initialization from file, use defaults. 
if (!std::filesystem::exists(json_path)) { return; @@ -22,7 +22,7 @@ WhisperConfig::WhisperConfig(const std::string& json_path) { using ov::genai::utils::read_json_param; std::ifstream f(json_path); - OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with config"); + OPENVINO_ASSERT(f.is_open(), "Failed to open '", json_path, "' with config"); nlohmann::json data = nlohmann::json::parse(f); diff --git a/src/cpp/src/whisper/whisper_config.hpp b/src/cpp/src/whisper/whisper_config.hpp index 31f8cd7618..8e67c3b5ab 100644 --- a/src/cpp/src/whisper/whisper_config.hpp +++ b/src/cpp/src/whisper/whisper_config.hpp @@ -3,7 +3,7 @@ #pragma once -#include +#include namespace ov { namespace genai { @@ -13,7 +13,7 @@ namespace genai { */ class WhisperConfig { public: - explicit WhisperConfig(const std::string& json_path); + explicit WhisperConfig(const std::filesystem::path& json_path); size_t max_source_positions = 1500; }; diff --git a/src/cpp/src/whisper/whisper_feature_extractor.cpp b/src/cpp/src/whisper/whisper_feature_extractor.cpp index 0aee350382..04070404c0 100644 --- a/src/cpp/src/whisper/whisper_feature_extractor.cpp +++ b/src/cpp/src/whisper/whisper_feature_extractor.cpp @@ -440,13 +440,13 @@ std::vector WhisperFeatures::get_data_with_offset(const size_t frame_offs return offset_data; } -WhisperFeatureExtractor::WhisperFeatureExtractor(const std::string& preprocessor_json_path) { +WhisperFeatureExtractor::WhisperFeatureExtractor(const std::filesystem::path& preprocessor_json_path) { init_parameters(preprocessor_json_path); fill_sin_cos_table(sin_vals, cos_vals, n_fft); init_mel_filter(); } -void WhisperFeatureExtractor::init_parameters(const std::string& preprocessor_json_path) { +void WhisperFeatureExtractor::init_parameters(const std::filesystem::path& preprocessor_json_path) { // preprocessor_config.json not found. Skip parameters initialization from file, use defaults. 
if (!std::filesystem::exists(preprocessor_json_path)) { return; @@ -455,7 +455,7 @@ void WhisperFeatureExtractor::init_parameters(const std::string& preprocessor_js using ov::genai::utils::read_json_param; std::ifstream f(preprocessor_json_path); - OPENVINO_ASSERT(f.is_open(), "Failed to open '" + preprocessor_json_path + "' with preprocessor config"); + OPENVINO_ASSERT(f.is_open(), "Failed to open '", preprocessor_json_path, "' with preprocessor config"); nlohmann::json data = nlohmann::json::parse(f); diff --git a/src/cpp/src/whisper/whisper_feature_extractor.hpp b/src/cpp/src/whisper/whisper_feature_extractor.hpp index b34b66c608..9197cc32ea 100644 --- a/src/cpp/src/whisper/whisper_feature_extractor.hpp +++ b/src/cpp/src/whisper/whisper_feature_extractor.hpp @@ -3,7 +3,7 @@ #pragma once -#include +#include #include #include "openvino/genai/visibility.hpp" @@ -41,7 +41,7 @@ class WhisperFeatureExtractor { size_t n_samples = 480000; size_t nb_max_frames = 3000; - explicit WhisperFeatureExtractor(const std::string& preprocessor_json_path); + explicit WhisperFeatureExtractor(const std::filesystem::path& preprocessor_json_path); /** * @brief Create a flattened 2d log-mel spectrogram [feature_size, n_frames] from raw speech data @@ -57,7 +57,7 @@ class WhisperFeatureExtractor { std::vector mel_filter; void init_mel_filter(); - void init_parameters(const std::string& preprocessor_json_path); + void init_parameters(const std::filesystem::path& preprocessor_json_path); }; } // namespace genai diff --git a/src/cpp/src/whisper_generation_config.cpp b/src/cpp/src/whisper_generation_config.cpp index e49ba57067..0fba4e962f 100644 --- a/src/cpp/src/whisper_generation_config.cpp +++ b/src/cpp/src/whisper_generation_config.cpp @@ -14,11 +14,11 @@ namespace ov { namespace genai { -WhisperGenerationConfig::WhisperGenerationConfig(const std::string& json_path) { +WhisperGenerationConfig::WhisperGenerationConfig(const std::filesystem::path& json_path) { using ov::genai::utils::read_json_param; std::ifstream f(json_path); - OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); + OPENVINO_ASSERT(f.is_open(), "Failed to open '", json_path, "' with generation config"); nlohmann::json data = nlohmann::json::parse(f); diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp index ac36be941d..66ec044673 100644 --- a/src/cpp/src/whisper_pipeline.cpp +++ b/src/cpp/src/whisper_pipeline.cpp @@ -16,8 +16,8 @@ #include "whisper/whisper_models.hpp" namespace { -ov::genai::WhisperGenerationConfig from_config_json_if_exists(const std::filesystem::path& model_path) { - auto config_file_path = model_path / "generation_config.json"; +ov::genai::WhisperGenerationConfig from_config_json_if_exists(const std::filesystem::path& models_path) { + auto config_file_path = models_path / "generation_config.json"; if (std::filesystem::exists(config_file_path)) { return ov::genai::WhisperGenerationConfig((config_file_path).string()); } else { @@ -48,24 +48,23 @@ class WhisperPipeline::Impl { Tokenizer m_tokenizer; float m_load_time_ms = 0; - Impl(const std::filesystem::path& model_path, - const ov::genai::Tokenizer& tokenizer, + Impl(const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& plugin_config) - : m_generation_config{from_config_json_if_exists(model_path)}, - m_tokenizer{tokenizer}, - m_feature_extractor{(model_path / "preprocessor_config.json").string()}, - m_model_config{(model_path / "config.json").string()} { - ov::Core core; - auto 
[core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config); - core.set_property(core_plugin_config); - - m_models.encoder = core.compile_model(model_path / "openvino_encoder_model.xml", device, compile_plugin_config) + const ov::AnyMap& properties) + : m_generation_config{from_config_json_if_exists(models_path)}, + m_tokenizer{models_path}, + m_feature_extractor{(models_path / "preprocessor_config.json")}, + m_model_config{(models_path / "config.json")} { + ov::Core core = utils::singleton_core(); + auto [core_properties, compile_properties] = ov::genai::utils::split_core_complile_config(properties); + core.set_property(core_properties); + + m_models.encoder = core.compile_model((models_path / "openvino_encoder_model.xml").string(), device, compile_properties) .create_infer_request(); - m_models.decoder = core.compile_model(model_path / "openvino_decoder_model.xml", device, compile_plugin_config) + m_models.decoder = core.compile_model((models_path / "openvino_decoder_model.xml").string(), device, compile_properties) .create_infer_request(); m_models.decoder_with_past = - core.compile_model(model_path / "openvino_decoder_with_past_model.xml", device, compile_plugin_config) + core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, compile_properties) .create_infer_request(); // If eos_token_id was not provided, take value @@ -74,9 +73,6 @@ class WhisperPipeline::Impl { } } - Impl(const std::filesystem::path& model_path, const std::string& device, const ov::AnyMap& plugin_config) - : Impl{model_path, Tokenizer(model_path.string()), device, plugin_config} {} - WhisperDecodedResults generate(const RawSpeechInput& raw_speech_input, OptionalWhisperGenerationConfig generation_config, StreamerVariant streamer) { @@ -139,21 +135,11 @@ class WhisperPipeline::Impl { } // namespace genai } // namespace ov -ov::genai::WhisperPipeline::WhisperPipeline(const std::string& model_path, - const ov::genai::Tokenizer& tokenizer, - const std::string& device, - const ov::AnyMap& plugin_config) { - auto start_time = std::chrono::steady_clock::now(); - m_impl = std::make_unique(model_path, tokenizer, device, plugin_config); - auto stop_time = std::chrono::steady_clock::now(); - m_impl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); -} - -ov::genai::WhisperPipeline::WhisperPipeline(const std::string& model_path, +ov::genai::WhisperPipeline::WhisperPipeline(const std::filesystem::path& models_path, const std::string& device, - const ov::AnyMap& plugin_config) { + const ov::AnyMap& properties) { auto start_time = std::chrono::steady_clock::now(); - m_impl = std::make_unique(model_path, device, plugin_config); + m_impl = std::make_unique(models_path, device, properties); auto stop_time = std::chrono::steady_clock::now(); m_impl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index ff651f0117..18d6e4ee81 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -5,10 +5,11 @@ #include #include #include +#include #include + #include "openvino/genai/continuous_batching_pipeline.hpp" #include "openvino/genai/llm_pipeline.hpp" -#include #include "tokenizers_path.hpp" #include "py_utils.hpp" @@ -84,7 +85,7 @@ auto decoded_results_docstring = R"( Structure to store resulting batched text outputs and scores for each batch. 
The first num_return_sequences elements correspond to the first batch element. - Parameters: + Parameters: texts: vector of resulting sequences. scores: scores for each sequence. metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. @@ -97,18 +98,18 @@ auto encoded_results_docstring = R"( sum of logarithmic probabilities for each token in the sequence. In the case of greedy decoding scores are filled with zeros. - Parameters: + Parameters: tokens: sequence of resulting tokens. scores: sum of logarithmic probabilities of all tokens in the sequence. metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. )"; auto generation_config_docstring = R"( - Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group - and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - Parameters: + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. @@ -129,8 +130,8 @@ auto generation_config_docstring = R"( length_penalty < 0.0 encourages shorter sequences. num_return_sequences: the number of sequences to return for grouped beam search decoding. no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. - stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: - "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). @@ -139,13 +140,13 @@ auto generation_config_docstring = R"( top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. - repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. 
)"; auto scheduler_config_docstring = R"( SchedulerConfig to construct ContinuousBatchingPipeline - Parameters: + Parameters: max_num_batched_tokens: a maximum number of tokens to batch (in constrast to max_batch_size which combines independent sequences, we consider total amount of tokens in a batch). num_kv_blocks: total number of KV blocks available to scheduler logic. @@ -166,7 +167,7 @@ auto scheduler_config_docstring = R"( auto generation_result_docstring = R"( GenerationResult stores resulting batched tokens and scores. - Parameters: + Parameters: request_id: obsolete when handle API is approved as handle will connect results with prompts. generation_ids: in a generic case we have multiple generation results per initial prompt depending on sampling parameters (e.g. beam search or parallel sampling). @@ -182,7 +183,7 @@ auto generation_result_docstring = R"( auto stop_criteria_docstring = R"( StopCriteria controls the stopping condition for grouped beam search. - + The following values are possible: "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates. "openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates. @@ -195,8 +196,8 @@ auto streamer_base_docstring = R"( auto tokenized_inputs_docstring = R"( Structure to agregate inputs to model. - - Parameters: + + Parameters: input_ids: numerical token IDs from the tokenizer attention_mask: indicates which tokens are attended to )"; @@ -332,10 +333,10 @@ py::list handle_utf8_results(const std::vector& decoded_res) { } py::object call_common_generate( - LLMPipeline& pipe, - const std::variant>& inputs, - const OptionalGenerationConfig& config, - const utils::PyBindStreamerVariant& py_streamer, + LLMPipeline& pipe, + const std::variant>& inputs, + const OptionalGenerationConfig& config, + const utils::PyBindStreamerVariant& py_streamer, const py::kwargs& kwargs ) { auto updated_config = ov::genai::pybind::utils::update_config_from_kwargs(config, kwargs); @@ -365,7 +366,7 @@ py::object call_common_generate( results = py::cast(pipe.generate(string_input, updated_config, streamer)); }}, inputs); - + return results; } @@ -403,87 +404,87 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "LLMPipeline", "This class is used for generation with LLMs") .def(py::init([]( - const std::string& model_path, + const std::filesystem::path& models_path, const std::string& device, const std::map& config ) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); - return std::make_unique(model_path, device, utils::properties_to_any_map(config)); + return std::make_unique(models_path, device, utils::properties_to_any_map(config)); }), - py::arg("model_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", - py::arg("device") = "CPU", "device on which inference will be done", + py::arg("models_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", + py::arg("device"), "device on which inference will be done", py::arg("config") = ov::AnyMap({}), "openvino.properties map", R"( LLMPipeline class constructor. - model_path (str): Path to the model file. + models_path (str): Path to the model file. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. 
)") .def(py::init([]( - const std::string& model_path, + const std::filesystem::path& models_path, const std::string& device, const py::kwargs& kwargs ) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); - return std::make_unique(model_path, device, utils::kwargs_to_any_map(kwargs)); + return std::make_unique(models_path, device, utils::kwargs_to_any_map(kwargs)); }), - py::arg("model_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", - py::arg("device") = "CPU", "device on which inference will be done", + py::arg("models_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", + py::arg("device"), "device on which inference will be done", R"( LLMPipeline class constructor. - model_path (str): Path to the model file. + models_path (str): Path to the model file. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. )") .def(py::init([]( - const std::string& model_path, + const std::filesystem::path& models_path, const Tokenizer& tokenizer, const std::string& device, const std::map& config ) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); - return std::make_unique(model_path, tokenizer, device, utils::properties_to_any_map(config)); + return std::make_unique(models_path, tokenizer, device, utils::properties_to_any_map(config)); }), - py::arg("model_path"), + py::arg("models_path"), py::arg("tokenizer"), - py::arg("device") = "CPU", + py::arg("device"), py::arg("config") = ov::AnyMap({}), "openvino.properties map", R"( LLMPipeline class constructor for manualy created openvino_genai.Tokenizer. - model_path (str): Path to the model file. + models_path (str): Path to the models file. tokenizer (openvino_genai.Tokenizer): tokenizer object. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. )") .def(py::init([]( - const std::string& model_path, + const std::filesystem::path& models_path, const Tokenizer& tokenizer, const std::string& device, const py::kwargs& kwargs ) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); - return std::make_unique(model_path, tokenizer, device, utils::kwargs_to_any_map(kwargs)); + return std::make_unique(models_path, tokenizer, device, utils::kwargs_to_any_map(kwargs)); }), - py::arg("model_path"), + py::arg("models_path"), py::arg("tokenizer"), - py::arg("device") = "CPU", + py::arg("device"), R"( LLMPipeline class constructor for manualy created openvino_genai.Tokenizer. - model_path (str): Path to the model file. + models_path (str): Path to the model file. tokenizer (openvino_genai.Tokenizer): tokenizer object. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. 
)") .def( - "generate", - [](LLMPipeline& pipe, - const std::variant>& inputs, - const OptionalGenerationConfig& generation_config, - const utils::PyBindStreamerVariant& streamer, + "generate", + [](LLMPipeline& pipe, + const std::variant>& inputs, + const OptionalGenerationConfig& generation_config, + const utils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) { return call_common_generate(pipe, inputs, generation_config, streamer, kwargs); @@ -495,11 +496,11 @@ PYBIND11_MODULE(py_generate_pipeline, m) { ) .def( - "__call__", - [](LLMPipeline& pipe, - const std::variant>& inputs, - const OptionalGenerationConfig& generation_config, - const utils::PyBindStreamerVariant& streamer, + "__call__", + [](LLMPipeline& pipe, + const std::variant>& inputs, + const OptionalGenerationConfig& generation_config, + const utils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) { return call_common_generate(pipe, inputs, generation_config, streamer, kwargs); @@ -518,14 +519,14 @@ PYBIND11_MODULE(py_generate_pipeline, m) { // Binding for Tokenizer py::class_(m, "Tokenizer", - R"(openvino_genai.Tokenizer object is used to initialize Tokenizer + R"(openvino_genai.Tokenizer object is used to initialize Tokenizer if it's located in a different path than the main model.)") - - .def(py::init([](const std::string& tokenizer_path, const std::map& plugin_config) { + + .def(py::init([](const std::filesystem::path& tokenizer_path, const std::map& plugin_config) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); return std::make_unique(tokenizer_path, utils::properties_to_any_map(plugin_config)); }), py::arg("tokenizer_path"), py::arg("plugin_config") = ov::AnyMap({})) - + .def("encode", [](Tokenizer& tok, std::vector& prompts, bool add_special_tokens) { ov::AnyMap tokenization_params; tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens; @@ -534,7 +535,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::arg("prompts"), py::arg("add_special_tokens") = true, R"(Encodes a list of prompts into tokenized inputs.)") - + .def("encode", [](Tokenizer& tok, const std::string prompt, bool add_special_tokens) { ov::AnyMap tokenization_params; tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens; @@ -542,43 +543,43 @@ PYBIND11_MODULE(py_generate_pipeline, m) { }, py::arg("prompt"), py::arg("add_special_tokens") = true, R"(Encodes a single prompt into tokenized input.)") - + .def( - "decode", - [](Tokenizer& tok, std::vector& tokens) -> py::str { + "decode", + [](Tokenizer& tok, std::vector& tokens) -> py::str { return handle_utf8_results({tok.decode(tokens)})[0]; }, py::arg("tokens"), R"(Decode a sequence into a string prompt.)" ) - + .def( - "decode", - [](Tokenizer& tok, ov::Tensor& tokens) -> py::list { - return handle_utf8_results(tok.decode(tokens)); + "decode", + [](Tokenizer& tok, ov::Tensor& tokens) -> py::list { + return handle_utf8_results(tok.decode(tokens)); }, py::arg("tokens"), R"(Decode tensor into a list of string prompts.)") - + .def( - "decode", - [](Tokenizer& tok, std::vector>& tokens) -> py::list{ - return handle_utf8_results(tok.decode(tokens)); + "decode", + [](Tokenizer& tok, std::vector>& tokens) -> py::list{ + return handle_utf8_results(tok.decode(tokens)); }, py::arg("tokens"), R"(Decode a batch of tokens into a list of string prompt.)") - + .def("apply_chat_template", [](Tokenizer& tok, ChatHistory history, bool add_generation_prompt, const std::string& chat_template) { return tok.apply_chat_template(history, 
add_generation_prompt, chat_template); - }, - py::arg("history"), - py::arg("add_generation_prompt"), + }, + py::arg("history"), + py::arg("add_generation_prompt"), py::arg("chat_template") = "", R"(Embeds input prompts with special tags for a chat scenario.)") - + .def( "set_chat_template", &Tokenizer::set_chat_template, py::arg("chat_template"), "The new template to override with.", @@ -601,8 +602,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) { // Binding for GenerationConfig py::class_(m, "GenerationConfig", generation_config_docstring) - .def(py::init(), py::arg("json_path"), "path where generation_config.json is stored") - .def(py::init([](py::kwargs kwargs) { return *ov::genai::pybind::utils::update_config_from_kwargs(GenerationConfig(), kwargs); })) + .def(py::init(), py::arg("json_path"), "path where generation_config.json is stored") + .def(py::init([](const py::kwargs& kwargs) { return *ov::genai::pybind::utils::update_config_from_kwargs(GenerationConfig(), kwargs); })) .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) .def_readwrite("max_length", &GenerationConfig::max_length) .def_readwrite("ignore_eos", &GenerationConfig::ignore_eos) @@ -641,7 +642,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::str res; if (valid_utf8_strings.size() == 1) return valid_utf8_strings[0]; - + for (size_t i = 0; i < valid_utf8_strings.size() - 1; i++) { res += py::str(std::to_string(dr.scores[i])) + py::str(": ") + valid_utf8_strings[i] + py::str("\n"); } @@ -654,11 +655,11 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_property_readonly("generate_durations", [](const RawPerfMetrics &rw) { return get_ms(rw, &RawPerfMetrics::generate_durations); }) - .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { + .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { return get_ms(rw, &RawPerfMetrics::tokenization_durations); }) - .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { - return get_ms(rw, &RawPerfMetrics::detokenization_durations); + .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::detokenization_durations); }) .def_property_readonly("m_times_to_first_token", [](const RawPerfMetrics &rw) { return get_ms(rw, &RawPerfMetrics::m_times_to_first_token); @@ -779,14 +780,14 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .export_values(); py::class_(m, "ContinuousBatchingPipeline", "This class is used for generation with LLMs with continuous batchig") - .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& llm_plugin_config, const std::map& tokenizer_plugin_config) { + .def(py::init([](const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& llm_plugin_config, const std::map& tokenizer_plugin_config) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); - return std::make_unique(model_path, scheduler_config, device, utils::properties_to_any_map(llm_plugin_config), utils::properties_to_any_map(tokenizer_plugin_config)); - }), py::arg("model_path"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("llm_plugin_config") = ov::AnyMap({}), py::arg("tokenizer_plugin_config") = ov::AnyMap({})) - .def(py::init([](const std::string& model_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& 
plugin_config) { + return std::make_unique(models_path, scheduler_config, device, utils::properties_to_any_map(llm_plugin_config), utils::properties_to_any_map(tokenizer_plugin_config)); + }), py::arg("models_path"), py::arg("scheduler_config"), py::arg("device"), py::arg("llm_plugin_config") = ov::AnyMap({}), py::arg("tokenizer_plugin_config") = ov::AnyMap({})) + .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& plugin_config) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); - return std::make_unique(model_path, tokenizer, scheduler_config, device, utils::properties_to_any_map(plugin_config)); - }), py::arg("model_path"), py::arg("tokenizer"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap({})) + return std::make_unique(models_path, tokenizer, scheduler_config, device, utils::properties_to_any_map(plugin_config)); + }), py::arg("models_path"), py::arg("tokenizer"), py::arg("scheduler_config"), py::arg("device"), py::arg("plugin_config") = ov::AnyMap({})) .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) .def("get_config", &ContinuousBatchingPipeline::get_config) .def("get_metrics", &ContinuousBatchingPipeline::get_metrics) @@ -811,13 +812,13 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "draft_model", py::module_local(), "This class is used to enable Speculative Decoding") .def(py::init([]( - const std::string& model_path, + const std::filesystem::path& models_path, const std::string& device, const py::kwargs& kwargs ) { - return ov::genai::_draft_model(model_path, device, utils::kwargs_to_any_map(kwargs)).second; + return ov::genai::draft_model(models_path, device, utils::kwargs_to_any_map(kwargs)).second; }), - py::arg("model_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", + py::arg("models_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", py::arg("device") = "", "device on which inference will be performed" ); @@ -839,5 +840,5 @@ PYBIND11_MODULE(py_generate_pipeline, m) { // init text2image pipeline init_text2image_pipeline(m); - + } diff --git a/src/python/py_lora_adapter.cpp b/src/python/py_lora_adapter.cpp index 0f8529d294..6fdc4223e5 100644 --- a/src/python/py_lora_adapter.cpp +++ b/src/python/py_lora_adapter.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "py_utils.hpp" @@ -17,7 +18,7 @@ void init_lora_adapter(py::module_& m) { py::class_(m, "Adapter", "Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier.") .def(py::init<>()) .def(py::init([]( - const std::string& path + const std::filesystem::path& path ) { return ov::genai::Adapter(path); }), diff --git a/src/python/py_text2image_models.cpp b/src/python/py_text2image_models.cpp index f64438cade..52b5ab8b72 100644 --- a/src/python/py_text2image_models.cpp +++ b/src/python/py_text2image_models.cpp @@ -2,10 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 #include + #include #include #include +#include #include + #include "openvino/genai/text2image/pipeline.hpp" #include "py_utils.hpp" @@ -16,7 +19,7 @@ namespace utils = ov::genai::pybind::utils; void init_clip_text_model(py::module_& m) { auto clip_text_model = py::class_(m, "CLIPTextModel", "CLIPTextModel class.") .def(py::init([]( - const std::string& root_dir + const std::filesystem::path& root_dir ) { return 
std::make_unique(root_dir); }), @@ -26,7 +29,7 @@ void init_clip_text_model(py::module_& m) { root_dir (str): Model root directory. )") .def(py::init([]( - const std::string& root_dir, + const std::filesystem::path& root_dir, const std::string& device, const py::kwargs& kwargs ) { @@ -74,7 +77,7 @@ void init_clip_text_model(py::module_& m) { ) { self.compile(device, utils::kwargs_to_any_map(kwargs)); }, - py::arg("device") = "CPU", "device on which inference will be done", + py::arg("device"), "device on which inference will be done", R"( Compiles the model. device (str): Device to run the model on (e.g., CPU, GPU). @@ -85,7 +88,7 @@ void init_clip_text_model(py::module_& m) { void init_unet2d_condition_model(py::module_& m) { auto unet2d_condition_model = py::class_(m, "UNet2DConditionModel", "UNet2DConditionModel class.") .def(py::init([]( - const std::string& root_dir + const std::filesystem::path& root_dir ) { return std::make_unique(root_dir); }), @@ -95,7 +98,7 @@ void init_unet2d_condition_model(py::module_& m) { root_dir (str): Model root directory. )") .def(py::init([]( - const std::string& root_dir, + const std::filesystem::path& root_dir, const std::string& device, const py::kwargs& kwargs ) { @@ -122,7 +125,7 @@ void init_unet2d_condition_model(py::module_& m) { py::class_(unet2d_condition_model, "Config", "This class is used for storing UNet2DConditionModel config.") .def(py::init([]( - const std::string& config_path + const std::filesystem::path& config_path ) { return std::make_unique(config_path); })) @@ -139,13 +142,13 @@ void init_unet2d_condition_model(py::module_& m) { unet2d_condition_model.def("set_hidden_states", &ov::genai::UNet2DConditionModel::set_hidden_states); unet2d_condition_model.def( "compile", - [](ov::genai::UNet2DConditionModel& self, + [](ov::genai::UNet2DConditionModel& self, const std::string& device, const py::kwargs& kwargs ) { self.compile(device, utils::kwargs_to_any_map(kwargs)); }, - py::arg("device") = "CPU", "device on which inference will be done", + py::arg("device"), "device on which inference will be done", R"( Compiles the model. device (str): Device to run the model on (e.g., CPU, GPU). @@ -156,7 +159,7 @@ void init_unet2d_condition_model(py::module_& m) { void init_autoencoder_kl(py::module_& m) { auto autoencoder_kl = py::class_(m, "AutoencoderKL", "AutoencoderKL class.") .def(py::init([]( - const std::string& root_dir + const std::filesystem::path& root_dir ) { return std::make_unique(root_dir); }), @@ -166,7 +169,7 @@ void init_autoencoder_kl(py::module_& m) { root_dir (str): Root directory. )") .def(py::init([]( - const std::string& root_dir, + const std::filesystem::path& root_dir, const std::string& device, const py::kwargs& kwargs ) { @@ -193,7 +196,7 @@ void init_autoencoder_kl(py::module_& m) { py::class_(autoencoder_kl, "Config", "This class is used for storing AutoencoderKL config.") .def(py::init([]( - const std::string& config_path + const std::filesystem::path& config_path ) { return std::make_unique(config_path); })) @@ -207,13 +210,13 @@ void init_autoencoder_kl(py::module_& m) { autoencoder_kl.def("infer", &ov::genai::AutoencoderKL::infer); autoencoder_kl.def( "compile", - [](ov::genai::AutoencoderKL& self, + [](ov::genai::AutoencoderKL& self, const std::string& device, const py::kwargs& kwargs ) { self.compile(device, utils::kwargs_to_any_map(kwargs)); }, - py::arg("device") = "CPU", "device on which inference will be done" + py::arg("device"), "device on which inference will be done" R"( Compiles the model. 
device (str): Device to run the model on (e.g., CPU, GPU). @@ -224,7 +227,7 @@ void init_autoencoder_kl(py::module_& m) { void init_clip_text_model_with_projection(py::module_& m) { auto clip_text_model_with_projection = py::class_(m, "CLIPTextModelWithProjection", "CLIPTextModelWithProjection class.") .def(py::init([]( - const std::string& root_dir + const std::filesystem::path& root_dir ) { return std::make_unique(root_dir); }), @@ -234,7 +237,7 @@ void init_clip_text_model_with_projection(py::module_& m) { root_dir (str): Model root directory. )") .def(py::init([]( - const std::string& root_dir, + const std::filesystem::path& root_dir, const std::string& device, const py::kwargs& kwargs ) { @@ -261,7 +264,7 @@ void init_clip_text_model_with_projection(py::module_& m) { py::class_(clip_text_model_with_projection, "Config", "This class is used for storing CLIPTextModelWithProjection config.") .def(py::init([]( - const std::string& config_path + const std::filesystem::path& config_path ) { return std::make_unique(config_path); })) @@ -281,7 +284,7 @@ void init_clip_text_model_with_projection(py::module_& m) { ) { self.compile(device, utils::kwargs_to_any_map(kwargs)); }, - py::arg("device") = "CPU", "device on which inference will be done", + py::arg("device"), "device on which inference will be done", R"( Compiles the model. device (str): Device to run the model on (e.g., CPU, GPU). diff --git a/src/python/py_text2image_pipeline.cpp b/src/python/py_text2image_pipeline.cpp index 63877ba2e2..6bf07b167d 100644 --- a/src/python/py_text2image_pipeline.cpp +++ b/src/python/py_text2image_pipeline.cpp @@ -2,10 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 #include + #include #include #include +#include #include + #include "openvino/genai/text2image/pipeline.hpp" #include "py_utils.hpp" @@ -166,28 +169,28 @@ void init_text2image_pipeline(py::module_& m) { auto text2image_pipeline = py::class_(m, "Text2ImagePipeline", "This class is used for generation with text-to-image models.") .def(py::init([]( - const std::string& model_path + const std::filesystem::path& models_path ) { - return std::make_unique(model_path); + return std::make_unique(models_path); }), - py::arg("model_path"), "folder with exported model files.", + py::arg("models_path"), "folder with exported model files.", R"( Text2ImagePipeline class constructor. - model_path (str): Path to the folder with exported model files. + models_path (str): Path to the folder with exported model files. )") .def(py::init([]( - const std::string& model_path, + const std::filesystem::path& models_path, const std::string& device, const py::kwargs& kwargs ) { - return std::make_unique(model_path, device, text2image_kwargs_to_any_map(kwargs, true)); + return std::make_unique(models_path, device, text2image_kwargs_to_any_map(kwargs, true)); }), - py::arg("model_path"), "folder with exported model files.", - py::arg("device") = "CPU", "device on which inference will be done", + py::arg("models_path"), "folder with exported model files.", + py::arg("device"), "device on which inference will be done", R"( Text2ImagePipeline class constructor. - model_path (str): Path with exported model files. + models_path (str): Path with exported model files. device (str): Device to run the model on (e.g., CPU, GPU). 
kwargs: Text2ImagePipeline properties )") @@ -206,7 +209,7 @@ void init_text2image_pipeline(py::module_& m) { ) { pipe.compile(device, utils::kwargs_to_any_map(kwargs)); }, - py::arg("device") = "CPU", "device on which inference will be done", + py::arg("device"), "device on which inference will be done", R"( Compiles the model. device (str): Device to run the model on (e.g., CPU, GPU). diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp index 0beb71887c..6ae9e2c4f6 100644 --- a/src/python/py_utils.cpp +++ b/src/python/py_utils.cpp @@ -201,7 +201,7 @@ ov::genai::StreamerVariant pystreamer_to_streamer(const utils::PyBindStreamerVar } ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::OptionalGenerationConfig& config, const py::kwargs& kwargs) { - if(!config.has_value() && kwargs.empty()) + if (!config.has_value() && kwargs.empty()) return std::nullopt; ov::genai::GenerationConfig res_config; diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index 7f6d639e01..115d816e95 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -3,10 +3,13 @@ #include + #include #include #include +#include #include + #include "openvino/genai/visual_language/pipeline.hpp" #include "tokenizers_path.hpp" #include "py_utils.hpp" @@ -21,7 +24,7 @@ auto vlm_generate_docstring = R"( :param prompt: input prompt :type prompt: str - :param images: list of images + :param images: list of images :type inputs: List[ov.Tensor] :param generation_config: generation_config @@ -43,9 +46,9 @@ auto vlm_generate_kwargs_docstring = R"( :param prompt: input prompt :type prompt: str - :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. - - Expected parameters list: + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. 
+ + Expected parameters list: image: ov.Tensor - input image, images: List[ov.Tensor] - input images, generation_config: GenerationConfig, @@ -56,21 +59,21 @@ auto vlm_generate_kwargs_docstring = R"( )"; py::object call_vlm_generate( - ov::genai::VLMPipeline& pipe, + ov::genai::VLMPipeline& pipe, const std::string& prompt, const std::vector& images, - const ov::genai::GenerationConfig& generation_config, - const utils::PyBindStreamerVariant& py_streamer, + const ov::genai::GenerationConfig& generation_config, + const utils::PyBindStreamerVariant& py_streamer, const py::kwargs& kwargs ) { auto updated_config = *ov::genai::pybind::utils::update_config_from_kwargs(generation_config, kwargs); ov::genai::StreamerVariant streamer = ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer); - + return py::cast(pipe.generate(prompt, images, updated_config, streamer)); } py::object call_vlm_generate( - ov::genai::VLMPipeline& pipe, + ov::genai::VLMPipeline& pipe, const std::string& prompt, const py::kwargs& kwargs ) { @@ -95,40 +98,40 @@ py::object call_vlm_generate( "Use help(openvino_genai.VLMPipeline.generate) to get list of acceptable parameters.")); } } - + return py::cast(pipe.generate(prompt, params)); } void init_vlm_pipeline(py::module_& m) { py::class_(m, "VLMPipeline", "This class is used for generation with VLMs") .def(py::init([]( - const std::string& model_path, + const std::filesystem::path& models_path, const std::string& device, const std::map& config ) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); - return std::make_unique(model_path, device, utils::properties_to_any_map(config)); + return std::make_unique(models_path, device, utils::properties_to_any_map(config)); }), - py::arg("model_path"), "folder with exported model files", - py::arg("device") = "CPU", "device on which inference will be done", + py::arg("models_path"), "folder with exported model files", + py::arg("device"), "device on which inference will be done", py::arg("config") = ov::AnyMap({}), "openvino.properties map" R"( VLMPipeline class constructor. - model_path (str): Path to the folder with exported model files. + models_path (str): Path to the folder with exported model files. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. 
)") .def("start_chat", &ov::genai::VLMPipeline::start_chat, py::arg("system_message") = "") - .def("finish_chat", &ov::genai::VLMPipeline::finish_chat) + .def("finish_chat", &ov::genai::VLMPipeline::finish_chat) .def("get_tokenizer", &ov::genai::VLMPipeline::get_tokenizer) .def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config) .def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config) .def( - "generate", - [](ov::genai::VLMPipeline& pipe, + "generate", + [](ov::genai::VLMPipeline& pipe, const std::string& prompt, const std::vector& images, - const ov::genai::GenerationConfig& generation_config, + const ov::genai::GenerationConfig& generation_config, const utils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) { @@ -141,8 +144,8 @@ void init_vlm_pipeline(py::module_& m) { (vlm_generate_docstring + std::string(" \n ")).c_str() ) .def( - "generate", - [](ov::genai::VLMPipeline& pipe, + "generate", + [](ov::genai::VLMPipeline& pipe, const std::string& prompt, const py::kwargs& kwargs ) { diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index f10d5c86d6..da1d640ffe 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -1,14 +1,16 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#include "openvino/genai/whisper_generation_config.hpp" +#include "openvino/genai/whisper_pipeline.hpp" + #include #include #include +#include #include #include "tokenizers_path.hpp" -#include "openvino/genai/whisper_generation_config.hpp" -#include "openvino/genai/whisper_pipeline.hpp" #include "py_utils.hpp" namespace py = pybind11; @@ -29,10 +31,10 @@ namespace { auto whisper_generate_docstring = R"( High level generate that receives raw speech as a vector of floats and returns decoded output. - + :param raw_speech_input: inputs in the form of list of floats. Required to be normalized to near [-1, 1] range and have 16k Hz sampling rate. :type raw_speech_input: List[float] - + :param generation_config: generation_config :type generation_config: WhisperGenerationConfig or a Dict @@ -50,7 +52,7 @@ auto whisper_decoded_results_docstring = R"( Structure to store resulting batched text outputs and scores for each batch. The first num_return_sequences elements correspond to the first batch element. - Parameters: + Parameters: texts: vector of resulting sequences. scores: scores for each sequence. metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. @@ -84,19 +86,19 @@ auto whisper_generation_config_docstring = R"( pad_token_id: Padding token id. type: int - + translate_token_id: Translate token id. type: int - + transcribe_token_id: Transcribe token id. type: int - + no_timestamps_token_id: No timestamps token id. type: int - + is_multilingual: type: bool - + begin_suppress_tokens: A list containing tokens that will be supressed at the beginning of the sampling process. type: list[int] @@ -106,10 +108,10 @@ auto whisper_generation_config_docstring = R"( language: Language token to use for generation in the form of <|en|>. You can find all the possible language tokens in the generation_config.json lang_to_id dictionary. type: Optional[str] - + lang_to_id: Language token to token_id map. Initialized from the generation_config.json lang_to_id dictionary. 
type: Dict[str, int] - + task: Task to use for generation, either “translate” or “transcribe” type: int @@ -221,8 +223,8 @@ void init_whisper_pipeline(py::module_& m) { // Binding for WhisperGenerationConfig py::class_(m, "WhisperGenerationConfig", whisper_generation_config_docstring) - .def(py::init(), py::arg("json_path"), "path where generation_config.json is stored") - .def(py::init([](py::kwargs kwargs) { + .def(py::init(), py::arg("json_path"), "path where generation_config.json is stored") + .def(py::init([](const py::kwargs& kwargs) { return *update_whisper_config_from_kwargs(WhisperGenerationConfig(), kwargs); })) .def_readwrite("max_new_tokens", &WhisperGenerationConfig::max_new_tokens) @@ -255,42 +257,21 @@ void init_whisper_pipeline(py::module_& m) { .def_readonly("chunks", &WhisperDecodedResults::chunks); py::class_(m, "WhisperPipeline") - .def(py::init([](const std::string& model_path, + .def(py::init([](const std::filesystem::path& models_path, const std::string& device, const std::map& config) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); - return std::make_unique(model_path, device, utils::properties_to_any_map(config)); + return std::make_unique(models_path, device, utils::properties_to_any_map(config)); }), - py::arg("model_path"), + py::arg("models_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", - py::arg("device") = "CPU", + py::arg("device"), "device on which inference will be done", py::arg("config") = ov::AnyMap({}), "openvino.properties map", R"( WhisperPipeline class constructor. - model_path (str): Path to the model file. - device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. - )") - - .def(py::init([](const std::string& model_path, - const Tokenizer& tokenizer, - const std::string& device, - const std::map& config) { - return std::make_unique(model_path, - tokenizer, - device, - utils::properties_to_any_map(config)); - }), - py::arg("model_path"), - py::arg("tokenizer"), - py::arg("device") = "CPU", - py::arg("config") = ov::AnyMap({}), - "openvino.properties map", - R"( - WhisperPipeline class constructor for manualy created openvino_genai.Tokenizer. - model_path (str): Path to the model file. - tokenizer (openvino_genai.Tokenizer): tokenizer object. + models_path (str): Path to the model file. device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. 
)") diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index 897839b454..50ee452f5c 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -328,15 +328,15 @@ def run_hugging_face( def run_continuous_batching( - model_path : Path, + models_path : Path, scheduler_config : SchedulerConfig, prompts: List[str], generation_configs : List[GenerationConfig] ) -> List[GenerationResult]: - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {}, {}) + pipe = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config, "CPU", {}, {}) output = pipe.generate(prompts, generation_configs) del pipe - shutil.rmtree(model_path) + shutil.rmtree(models_path) return output @@ -363,14 +363,14 @@ def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, ge for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): assert hf_text == ov_text -def save_ov_model_from_optimum(model, hf_tokenizer, model_path: Path): - model.save_pretrained(model_path) +def save_ov_model_from_optimum(model, hf_tokenizer, models_path: Path): + model.save_pretrained(models_path) # convert tokenizers as well from openvino_tokenizers import convert_tokenizer from openvino import serialize tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, skip_special_tokens=True) - serialize(tokenizer, model_path / "openvino_tokenizer.xml") - serialize(detokenizer, model_path / "openvino_detokenizer.xml") + serialize(tokenizer, models_path / "openvino_tokenizer.xml") + serialize(detokenizer, models_path / "openvino_detokenizer.xml") def get_model_and_tokenizer(model_id: str, use_optimum = True): hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) @@ -380,18 +380,18 @@ def get_model_and_tokenizer(model_id: str, use_optimum = True): def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): use_optimum = True - model_path : Path = tmp_path / model_id + models_path : Path = tmp_path / model_id model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum) if use_optimum: - save_ov_model_from_optimum(model, hf_tokenizer, model_path) + save_ov_model_from_optimum(model, hf_tokenizer, models_path) hf_results = run_hugging_face(model=model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs) - _generate_and_compare_with_reference_results(model_path, prompts, hf_results, generation_configs, scheduler_config) + _generate_and_compare_with_reference_results(models_path, prompts, hf_results, generation_configs, scheduler_config) -def _generate_and_compare_with_reference_results(model_path: Path, prompts: List[str], reference_results: List[GenerationResult], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): - ov_results : List[GenerationResult] = run_continuous_batching(model_path, scheduler_config, prompts, generation_configs) +def _generate_and_compare_with_reference_results(models_path: Path, prompts: List[str], reference_results: List[GenerationResult], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): + ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs) assert len(prompts) == len(reference_results) assert len(prompts) == len(ov_results) @@ -401,8 +401,8 @@ def 
_generate_and_compare_with_reference_results(model_path: Path, prompts: List compare_results(ref_result, ov_result, generation_config) -def generate_and_compare_with_reference_text(model_path: Path, prompts: List[str], reference_texts_per_prompt: List[List[str]], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): - ov_results : List[GenerationResult] = run_continuous_batching(model_path, scheduler_config, prompts, generation_configs) +def generate_and_compare_with_reference_text(models_path: Path, prompts: List[str], reference_texts_per_prompt: List[List[str]], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): + ov_results : List[GenerationResult] = run_continuous_batching(models_path, scheduler_config, prompts, generation_configs) assert len(prompts) == len(reference_texts_per_prompt) assert len(prompts) == len(ov_results) diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py index 1195d0b04e..9d48a5d1e4 100644 --- a/tests/python_tests/ov_genai_test_utils.py +++ b/tests/python_tests/ov_genai_test_utils.py @@ -197,7 +197,7 @@ def read_model(params, **tokenizer_kwargs): path, tokenizer, opt_model, - ov_genai.LLMPipeline(str(path), device='CPU', config={"ENABLE_MMAP": False}), + ov_genai.LLMPipeline(path, 'CPU', config={'ENABLE_MMAP': False}), ) @@ -220,7 +220,7 @@ def model_tmp_path(tmpdir_factory): for pattern in ['*.xml', '*.bin']: for src_file in path.glob(pattern): if src_file.is_file(): - shutil.copy(src_file, temp_path / src_file.name) + shutil.copy(src_file, temp_path / src_file.name) yield model_id, Path(temp_path) @@ -233,7 +233,7 @@ def load_tok(configs: List[Tuple], temp_path): for config_json, config_name in configs: with (temp_path / config_name).open('w') as f: json.dump(config_json, f) - return ov_genai.Tokenizer(str(temp_path), {}) + return ov_genai.Tokenizer(temp_path) def load_pipe(configs: List[Tuple], temp_path): @@ -245,11 +245,11 @@ def load_pipe(configs: List[Tuple], temp_path): for config_json, config_name in configs: with (temp_path / config_name).open('w') as f: json.dump(config_json, f) - return ov_genai.LLMPipeline(str(temp_path)) + return ov_genai.LLMPipeline(temp_path, 'CPU') @functools.lru_cache(1) def get_continuous_batching(path): scheduler_config = ov_genai.SchedulerConfig() scheduler_config.cache_size = 1 - return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), device='CPU', config={"scheduler_config": scheduler_config}) + return ov_genai.LLMPipeline(path, ov_genai.Tokenizer(path), 'CPU', config={"scheduler_config": scheduler_config}) diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_cache_optimizations.py index f3125976d2..49cb04ca1f 100644 --- a/tests/python_tests/test_cache_optimizations.py +++ b/tests/python_tests/test_cache_optimizations.py @@ -37,7 +37,7 @@ def get_scheduler_config(num_kv_blocks: int) -> SchedulerConfig: class ConvertedModel: model: OVModelForCausalLM tokenizer: AutoTokenizer - model_path: Path + models_path: Path @pytest.fixture(scope='module') @@ -45,12 +45,12 @@ def converted_model(tmp_path_factory): model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_id) - model_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id - model.save_pretrained(model_path) + models_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id + 
model.save_pretrained(models_path) ov_tokenizer, ov_detokenizer = convert_tokenizer(tokenizer, with_detokenizer=True, skip_special_tokens=True) - serialize(ov_tokenizer, model_path / "openvino_tokenizer.xml") - serialize(ov_detokenizer, model_path / "openvino_detokenizer.xml") - converted_model = ConvertedModel(model, tokenizer, model_path) + serialize(ov_tokenizer, models_path / "openvino_tokenizer.xml") + serialize(ov_detokenizer, models_path / "openvino_detokenizer.xml") + converted_model = ConvertedModel(model, tokenizer, models_path) yield converted_model del converted_model del model @@ -110,9 +110,9 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t scheduler_config_opt.cache_eviction_config = test_struct.cache_eviction_config scheduler_config_opt.enable_prefix_caching = enable_prefix_caching - model_path = converted_model.model_path - model_cb_noopt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {}) - model_cb_opt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {}) + models_path = converted_model.models_path + model_cb_noopt = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config, "CPU", {}) + model_cb_opt = ContinuousBatchingPipeline(models_path.absolute().as_posix(), scheduler_config_opt, "CPU", {}) tokenizer = converted_model.tokenizer diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py index b68de6372d..3e29e8ad2c 100644 --- a/tests/python_tests/test_chat_generate_api.py +++ b/tests/python_tests/test_chat_generate_api.py @@ -118,7 +118,7 @@ def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. # Need to regenerate openvino_tokenizer/detokenizer. 
model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) - pipe_with_kv_cache = ov_genai.LLMPipeline(str(path), device, config={"ENABLE_MMAP": False}) + pipe_with_kv_cache = ov_genai.LLMPipeline(path, device, config={"ENABLE_MMAP": False}) pipe_with_kv_cache.start_chat() for question in quenstions: diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index f80729d425..9a9d80d6ae 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -132,7 +132,7 @@ def hf_ov_genai_tensors_comparison( hf_output = model.generate(**inputs_hf, **generation_config_hf) - pipe = ov_genai.LLMPipeline(str(path), device) + pipe = ov_genai.LLMPipeline(path, device) ov_output = pipe.generate(inputs_ov, **config) hf_res = hf_output[0, input_ids.shape[1]:].numpy() diff --git a/tests/python_tests/test_preemption.py b/tests/python_tests/test_preemption.py index 239ae6399c..ec21089247 100644 --- a/tests/python_tests/test_preemption.py +++ b/tests/python_tests/test_preemption.py @@ -89,11 +89,11 @@ def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): model_id : str = "facebook/opt-125m" model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) - model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, model_path) + models_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, models_path) scheduler_config = get_scheduler_config({"num_kv_blocks": 3, "block_size": 32, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - generate_and_compare_with_reference_text(model_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config) + generate_and_compare_with_reference_text(models_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config) multinomial_params_n_seq = RandomSamplingTestStruct( @@ -170,9 +170,9 @@ def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse): model_id : str = "facebook/opt-125m" model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) - model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, model_path) + models_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, models_path) # needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq ) scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "block_size": 32, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - generate_and_compare_with_reference_text(model_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config) \ No newline at end of file + generate_and_compare_with_reference_text(models_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config) \ No newline at end of file diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index c88776d0dc..b98caf4f6e 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -307,11 +307,11 @@ def test_individual_generation_configs_random(tmp_path, test_struct: RandomSampl model_id : str = "facebook/opt-125m" model, hf_tokenizer = 
get_model_and_tokenizer(model_id, use_optimum=True) - model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, model_path) + models_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, models_path) # run multinomial without comparison with reference - _ = run_continuous_batching(model_path, DEFAULT_SCHEDULER_CONFIG, prompts, generation_configs) + _ = run_continuous_batching(models_path, DEFAULT_SCHEDULER_CONFIG, prompts, generation_configs) # Reference comparison is not performed as sampling results are non-deterministic. # Discrete_distribution impl depends on platform, model inference results may depend on CPU. @@ -332,10 +332,10 @@ def test_post_oom_health(tmp_path, sampling_config): model_id : str = "facebook/opt-125m" model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) - model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, model_path) + models_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, models_path) - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix(), {}), scheduler_config, "CPU", {}) + pipe = ContinuousBatchingPipeline(models_path.absolute().as_posix(), Tokenizer(models_path.absolute().as_posix(), {}), scheduler_config, "CPU", {}) # First run should return incomplete response output = pipe.generate(["What is OpenVINO?"], generation_configs) assert (len(output)) @@ -345,4 +345,4 @@ def test_post_oom_health(tmp_path, sampling_config): assert (len(output)) assert(len(output[0].m_generation_ids)) del pipe - shutil.rmtree(model_path) + shutil.rmtree(models_path) diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_vlm_api.py index 4354531f9c..0cb2e509f3 100644 --- a/tests/python_tests/test_vlm_api.py +++ b/tests/python_tests/test_vlm_api.py @@ -48,14 +48,14 @@ def test_vlm_pipeline(cache): def streamer(word: str) -> bool: return False - model_path = get_ov_model(cache) + models_path = get_ov_model(cache) for links in image_links_for_testing: images = [] for link in links: images.append(get_image_by_link(link)) - pipe = VLMPipeline(str(model_path), "CPU") + pipe = VLMPipeline(models_path, "CPU") pipe.start_chat() pipe.generate(prompts[0], images=images, generation_config=get_greedy(), streamer=streamer) @@ -69,8 +69,8 @@ def streamer(word: str) -> bool: @pytest.mark.precommit @pytest.mark.nightly def test_vlm_get_tokenizer(cache): - model_path = get_ov_model(cache) - pipe = VLMPipeline(str(model_path), "CPU") + models_path = get_ov_model(cache) + pipe = VLMPipeline(models_path, "CPU") tokenizer = pipe.get_tokenizer() tokenizer.encode("") @@ -82,7 +82,7 @@ def test_vlm_get_tokenizer(cache): get_multinomial_all_parameters(), ]) def test_sampling(config, cache): - model_path = get_ov_model(cache) + models_path = get_ov_model(cache) image = get_image_by_link(image_links[0]) - pipe = VLMPipeline(str(model_path), "CPU") + pipe = VLMPipeline(models_path, "CPU") pipe.generate(prompts[0], image=image, generation_config=config) diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_generate_api.py index 6eeea76d31..b96b002774 100644 --- a/tests/python_tests/test_whisper_generate_api.py +++ b/tests/python_tests/test_whisper_generate_api.py @@ -68,7 +68,7 @@ def read_whisper_model(params, **tokenizer_kwargs): path, opt_pipe, ov_genai.WhisperPipeline( - str(path), device="CPU", config={"ENABLE_MMAP": False} + path, 'CPU', 
config={'ENABLE_MMAP': False} ), ) @@ -157,7 +157,7 @@ def test_smoke(model_descr, test_sample): def test_whisper_config_constructor(model_descr): model_id, path = model_descr - config = ov_genai.WhisperGenerationConfig(str(path / "generation_config.json")) + config = ov_genai.WhisperGenerationConfig(path / "generation_config.json") with open(path / "generation_config.json") as f: original_config = json.load(f) @@ -191,34 +191,6 @@ def test_whisper_config_constructor(model_descr): assert config.lang_to_id["<|_ru|>"] == 42 -@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) -@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) -@pytest.mark.precommit -def test_whisper_constructors(model_descr, test_sample): - model_id, path = model_descr - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) - - expected = opt_pipe(test_sample)["text"] - - genai_result = ov_genai.WhisperPipeline( - str(path), device="CPU", config={"ENABLE_MMAP": False} - ).generate(test_sample) - - assert genai_result.texts[0] == expected - - genai_result = ov_genai.WhisperPipeline(str(path)).generate(test_sample) - - assert genai_result.texts[0] == expected - - tokenizer = ov_genai.Tokenizer(str(path)) - - genai_result = ov_genai.WhisperPipeline( - str(path), tokenizer=tokenizer, device="CPU", config={"ENABLE_MMAP": False} - ).generate(test_sample) - - assert genai_result.texts[0] == expected - - @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit @@ -228,23 +200,12 @@ def test_max_new_tokens(model_descr, test_sample): expected = opt_pipe(test_sample, max_new_tokens=30)["text"] - genai_result = ov_genai.WhisperPipeline(str(path)).generate( + genai_result = ov_genai.WhisperPipeline(path, 'CPU').generate( test_sample, max_new_tokens=30 ) assert genai_result.texts[0] == expected - tokenizer = ov_genai.Tokenizer(str(path)) - - genai_pipeline = ov_genai.WhisperPipeline( - str(path), tokenizer=tokenizer, device="CPU", config={"ENABLE_MMAP": False} - ) - config = genai_pipeline.get_generation_config() - config.max_new_tokens = 30 - genai_result = genai_pipeline.generate(test_sample, config) - - assert genai_result.texts[0] == expected - @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( diff --git a/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp b/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp index eed6fdd49d..1c3473f841 100644 --- a/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp +++ b/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp @@ -73,8 +73,8 @@ int main(int argc, char* argv[]) try { const size_t num_prompts = result["num_prompts"].as(); const bool dynamic_split_fuse = result["dynamic_split_fuse"].as(); - const std::string model_path = result["model"].as(); - const std::string draft_model_path = result["draft_model"].as(); + const std::string models_path = result["model"].as(); + const std::string draft_models_path = result["draft_model"].as(); const std::string device = result["device"].as(); std::vector prompt_examples = { @@ -117,7 +117,7 @@ int main(int argc, char* argv[]) try { // vLLM specific params scheduler_config.max_num_seqs = 2; - ov::genai::ContinuousBatchingPipeline pipe(model_path, scheduler_config, device, 
{ov::genai::draft_model(draft_model_path, device)}); + ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config, device, {ov::genai::draft_model(draft_models_path, device)}); std::vector generation_results = pipe.generate(prompts, generation_config); for (size_t request_id = 0; request_id < generation_results.size(); ++request_id) { diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py index da77f5da22..173e87d3a7 100644 --- a/tools/llm_bench/llm_bench_utils/ov_utils.py +++ b/tools/llm_bench/llm_bench_utils/ov_utils.py @@ -201,7 +201,7 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): setattr(scheduler_config, param, value) ov_config["scheduler_config"] = scheduler_config start = time.perf_counter() - llm_pipe = openvino_genai.LLMPipeline(str(model_path), device.upper(), ov_config) + llm_pipe = openvino_genai.LLMPipeline(model_path, device.upper(), ov_config) end = time.perf_counter() log.info(f'Pipeline initialization time: {end - start:.2f}s')
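
Taken together, the bindings above switch every Python pipeline to a models_path argument that accepts str or pathlib.Path and drop the implicit "CPU" device default. A minimal sketch of the updated LLMPipeline and Tokenizer call pattern, assuming a model exported to a local folder (the folder name is hypothetical):

    from pathlib import Path
    import openvino_genai as ov_genai

    models_path = Path("TinyLlama-1.1B-Chat-v1.0")  # hypothetical folder with openvino_model.xml and tokenizer files
    device = "CPU"                                  # must now be passed explicitly

    pipe = ov_genai.LLMPipeline(models_path, device)
    print(pipe.generate("What is OpenVINO?", max_new_tokens=100))

    # A Tokenizer built separately (also from a path) can still be injected:
    tokenizer = ov_genai.Tokenizer(models_path)
    pipe = ov_genai.LLMPipeline(models_path, tokenizer, device)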
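
For WhisperPipeline, the Tokenizer-based constructor is gone and both the pipeline and WhisperGenerationConfig now take filesystem paths. A hedged sketch; the export folder is hypothetical and the silent dummy waveform only stands in for real 16 kHz speech:

    from pathlib import Path
    import openvino_genai as ov_genai

    models_path = Path("whisper-tiny")              # hypothetical Whisper export folder
    pipe = ov_genai.WhisperPipeline(models_path, "CPU")

    config = ov_genai.WhisperGenerationConfig(models_path / "generation_config.json")
    config.max_new_tokens = 30

    raw_speech = [0.0] * 16000                      # 1 s of silence; real input is normalized 16 kHz audio
    result = pipe.generate(raw_speech, config)
    print(result.texts[0])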
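
ContinuousBatchingPipeline follows the same pattern: models_path plus an explicit device, with the two trailing dicts holding LLM and tokenizer plugin properties. A sketch mirroring the test usage (the folder name is hypothetical and the cache size is only an illustrative value):

    import openvino_genai as ov_genai

    scheduler_config = ov_genai.SchedulerConfig()
    scheduler_config.cache_size = 1                 # illustrative, as in the test utilities

    pipe = ov_genai.ContinuousBatchingPipeline("TinyLlama-1.1B-Chat-v1.0",  # hypothetical folder
                                               scheduler_config, "CPU", {}, {})
    results = pipe.generate(["What is OpenVINO?"],
                            [ov_genai.GenerationConfig(max_new_tokens=30)])
    print(results[0].m_generation_ids[0])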
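
The renamed draft_model binding enables speculative decoding without the old _draft_model workaround. The diff only shows the C++ property being passed to ContinuousBatchingPipeline, so the Python keyword below is an assumption that mirrors the property name:

    import openvino_genai as ov_genai

    main_path = "main-model"                        # hypothetical export folders
    draft_path = "draft-model"

    # Assumption: the property is forwarded by name through the kwargs constructor,
    # analogous to ov::genai::draft_model(draft_models_path, device) on the C++ side.
    pipe = ov_genai.LLMPipeline(main_path, "CPU",
                                draft_model=ov_genai.draft_model(draft_path, "CPU"))
    print(pipe.generate("What is OpenVINO?", max_new_tokens=30))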
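
Text2ImagePipeline and the individual text-to-image models can either be compiled at construction or built first and compiled later, in both cases with an explicit device. The folder name is hypothetical, and the generation properties in the last line are not part of this diff; they are an assumption based on the text2image samples:

    from pathlib import Path
    import openvino_genai as ov_genai

    models_path = Path("stable-diffusion-v1-5")     # hypothetical export folder

    # Compile during construction ...
    pipe = ov_genai.Text2ImagePipeline(models_path, "CPU")

    # ... or construct on the host first and compile explicitly.
    pipe = ov_genai.Text2ImagePipeline(models_path)
    pipe.compile("CPU")

    # Assumed generation properties (width/height/num_inference_steps).
    image_tensor = pipe.generate("a photo of a cat", width=512, height=512, num_inference_steps=20)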
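
VLMPipeline is aligned the same way: folder path plus explicit device, with images passed as openvino.Tensor objects. In the sketch below, read_rgb_image is a hypothetical helper (not part of openvino_genai) standing in for the image loading done in the VLM samples, and the model folder name is hypothetical:

    import openvino_genai as ov_genai

    pipe = ov_genai.VLMPipeline("MiniCPM-V-2_6", "CPU")   # hypothetical export folder

    image = read_rgb_image("cat.png")               # hypothetical helper returning an openvino.Tensor
    config = ov_genai.GenerationConfig(max_new_tokens=30)

    result = pipe.generate("Describe this image.", images=[image], generation_config=config)
    print(result)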