From ec5f305c8bf9a6ee7d43a46e24e759eb20eba05f Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 19 Sep 2024 14:30:58 +0400 Subject: [PATCH 01/57] Use continuous batching by default --- src/cpp/src/llm_pipeline.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 66e2890671..ff79efb75b 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -515,6 +515,7 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, OptionalGenerationConfig generation_config ) { + OPENVINO_THROW("Not supported"); auto start_time = std::chrono::steady_clock::now(); m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer, generation_config); auto stop_time = std::chrono::steady_clock::now(); @@ -527,12 +528,24 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& plugin_config ){ + std::cout << "Using continuous batching backend.\n"; auto start_time = std::chrono::steady_clock::now(); if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end()) { auto config_without_scheduler_config = plugin_config; config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); auto& scheduler_config = plugin_config.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>(); m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_path, tokenizer, scheduler_config, device, config_without_scheduler_config); + std::cout << "Found custom SchedulerConfig.\n"; + } else if (true) { + SchedulerConfig scheduler_config; + scheduler_config.cache_size = 8; + m_pimpl = std::make_unique<ContinuousBatchingAdapter>( + model_path, + tokenizer, + scheduler_config, + device, + plugin_config + ); } else if ("NPU" == device) { m_pimpl = std::make_unique<StaticLLMPipeline>(model_path, tokenizer, device, plugin_config); } else { @@ -547,12 +560,23 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& config ){ + std::cout << "Using continuous batching backend.\n"; auto start_time = std::chrono::steady_clock::now(); if (config.find(ov::genai::scheduler_config.name()) != config.end()) { auto config_without_scheduler_config = config; config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>(); m_pimpl = std::make_unique<ContinuousBatchingAdapter>(path, scheduler_config, device, config_without_scheduler_config); + std::cout << "Found custom SchedulerConfig.\n"; + } else if (true) { + SchedulerConfig scheduler_config; + scheduler_config.cache_size = 8; + m_pimpl = std::make_unique<ContinuousBatchingAdapter>( + path, + scheduler_config, + device, + config + ); } else if ("NPU" == device) { m_pimpl = std::make_unique<StaticLLMPipeline>(path, device, config); } else { From 41d1fe7fd07487ccddd8d39cf6305a83c1511240 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 16:41:54 +0400 Subject: [PATCH 02/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index ff79efb75b..46650e94bb 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -538,7 +538,7 @@ ov::genai::LLMPipeline::LLMPipeline( std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.cache_size = 8; + scheduler_config.num_kv_blocks = 16; m_pimpl = std::make_unique<ContinuousBatchingAdapter>( model_path, tokenizer, From 36150c4c314a19768daff37e059492d95b2f4b07 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep
2024 16:42:01 +0400 Subject: [PATCH 03/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 46650e94bb..d68b2aa250 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -570,7 +570,7 @@ ov::genai::LLMPipeline::LLMPipeline( std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.cache_size = 8; + scheduler_config.num_kv_blocks= 16; m_pimpl = std::make_unique( path, scheduler_config, From 4a4a09e705d98e081cdd510520e01f18045e9698 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 17:32:01 +0400 Subject: [PATCH 04/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index d68b2aa250..d34ee2563b 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -571,6 +571,7 @@ ov::genai::LLMPipeline::LLMPipeline( } else if (true) { SchedulerConfig scheduler_config; scheduler_config.num_kv_blocks= 16; + scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( path, scheduler_config, From 1a58b5e8bb60d21b5c5299fe1c18ca94c3131c74 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 17:32:06 +0400 Subject: [PATCH 05/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index d34ee2563b..616187d501 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -539,6 +539,7 @@ ov::genai::LLMPipeline::LLMPipeline( } else if (true) { SchedulerConfig scheduler_config; scheduler_config.num_kv_blocks = 16; + scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( model_path, tokenizer, From 90d81e654d884fc5221785860d4c04f3e8a9e699 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Fri, 20 Sep 2024 17:31:02 +0400 Subject: [PATCH 06/57] Reorder cout --- src/cpp/src/continuous_batching_pipeline.cpp | 8 +------- src/cpp/src/timer.hpp | 4 +--- src/python/py_generate_pipeline.cpp | 7 ++++++- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 6100a870f3..13f2005acd 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -41,13 +41,7 @@ class ContinuousBatchingPipeline::Impl { float m_matmul_time_ms = 0.0f; float m_infer_total_ms = 0.0f; - ~PerfTime() { - std::cout << "Inference requests aggregated statistic: " << std::endl; - std::cout << "Paged attention % of inference execution: " << (m_paged_attention_time_ms / m_infer_total_ms) * 100 << std::endl; - std::cout << "MatMul % of inference execution: " << (m_matmul_time_ms / m_infer_total_ms) * 100 << std::endl; - std::cout << "Total inference execution secs: " << m_infer_total_ms / 1000. << std::endl; - std::cout << std::endl; - } + ~PerfTime() {} } m_perf; // current requests to process diff --git a/src/cpp/src/timer.hpp b/src/cpp/src/timer.hpp index c4893acd1c..03367a5530 100644 --- a/src/cpp/src/timer.hpp +++ b/src/cpp/src/timer.hpp @@ -26,7 +26,5 @@ class ManualTimer { m_total += std::chrono::duration(m_end - m_start).count(); } - ~ManualTimer() { - std::cout << m_title << ": " << m_total / 1000. 
<< " secs" << std::endl; - } + ~ManualTimer() {} }; diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 058fd2a823..0104364891 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -10,6 +10,7 @@ #include "openvino/genai/llm_pipeline.hpp" #include #include "../cpp/src/tokenizers_path.hpp" +#include #include "./utils.hpp" @@ -433,7 +434,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { m.doc() = "Pybind11 binding for LLM Pipeline"; py::class_(m, "LLMPipeline", "This class is used for generation with LLMs") - .def(py::init([]( + .def(py::init([&]( const std::string& model_path, const std::string& device, const std::map& config @@ -441,6 +442,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); return std::make_unique(model_path, device, utils::properties_to_any_map(config)); }), + py::call_guard(), py::arg("model_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", py::arg("device") = "CPU", "device on which inference will be done", py::arg("config") = ov::AnyMap({}), "openvino.properties map", @@ -460,6 +463,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); return std::make_unique(model_path, tokenizer, device, utils::properties_to_any_map(config)); }), + py::call_guard(), py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", From 6dc43a30afa9c1a960b5b215b4ad35ae8d8cffb7 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 18:48:12 +0400 Subject: [PATCH 07/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 616187d501..09b9ad3ced 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -571,7 +571,7 @@ ov::genai::LLMPipeline::LLMPipeline( std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.num_kv_blocks= 16; + scheduler_config.num_kv_blocks= 32; scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( path, From 03e2f329696c93dfe758dbc021f799d291719596 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 18:48:17 +0400 Subject: [PATCH 08/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 09b9ad3ced..73076ed96c 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -538,7 +538,7 @@ ov::genai::LLMPipeline::LLMPipeline( std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.num_kv_blocks = 16; + scheduler_config.num_kv_blocks = 32; scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( model_path, From e561e937ca884d5b1f80b6cc2a5019cd36a45135 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 19:03:00 +0400 Subject: [PATCH 09/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 73076ed96c..270f1e3f71 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -538,7 +538,7 @@ ov::genai::LLMPipeline::LLMPipeline( std::cout << 
"Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.num_kv_blocks = 32; + scheduler_config.num_kv_blocks = 64; scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( model_path, From 37ea2adebbce22d6ead62f964f883ccbde3045c2 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 19:03:05 +0400 Subject: [PATCH 10/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 270f1e3f71..e2c5fe3724 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -571,7 +571,7 @@ ov::genai::LLMPipeline::LLMPipeline( std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.num_kv_blocks= 32; + scheduler_config.num_kv_blocks= 64; scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( path, From b62aee9ed6c7c3f22a54d49dc8465316f82a17de Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Mon, 23 Sep 2024 11:45:43 +0400 Subject: [PATCH 11/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index e2c5fe3724..3c3aebe9de 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -535,7 +535,7 @@ ov::genai::LLMPipeline::LLMPipeline( config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); auto& scheduler_config = plugin_config.at(ov::genai::scheduler_config.name()).as(); m_pimpl = std::make_unique(model_path, tokenizer, scheduler_config, device, config_without_scheduler_config); - std::cout << "Found custom SchedulerConfig.\n"; + // std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; scheduler_config.num_kv_blocks = 64; From 07505b3c8992e1afa1240ddfc0e8c8b24a8c7cdd Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Mon, 23 Sep 2024 11:45:48 +0400 Subject: [PATCH 12/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 3c3aebe9de..5372f934ba 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -528,7 +528,7 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& plugin_config ){ - std::cout << "Using continuous batching backend.\n"; + // std::cout << "Using continuous batching backend.\n"; auto start_time = std::chrono::steady_clock::now(); if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end()) { auto config_without_scheduler_config = plugin_config; From e07881886ed6595b54b5f49295951fcfd54282c2 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Mon, 23 Sep 2024 11:57:38 +0400 Subject: [PATCH 13/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 5372f934ba..e45e7258f6 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -568,7 +568,7 @@ ov::genai::LLMPipeline::LLMPipeline( config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as(); m_pimpl = 
std::make_unique(path, scheduler_config, device, config_without_scheduler_config); - std::cout << "Found custom SchedulerConfig.\n"; + // std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; scheduler_config.num_kv_blocks= 64; From 001d3a0932d5fc7db94aaa01611ba7c31124e7b3 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Mon, 23 Sep 2024 11:57:45 +0400 Subject: [PATCH 14/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index e45e7258f6..40ef2c74ef 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -561,7 +561,7 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& config ){ - std::cout << "Using continuous batching backend.\n"; + // std::cout << "Using continuous batching backend.\n"; auto start_time = std::chrono::steady_clock::now(); if (config.find(ov::genai::scheduler_config.name()) != config.end()) { auto config_without_scheduler_config = config; From a0a964f483d1a542b27c47269ca1774963fbe145 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Wed, 25 Sep 2024 18:32:54 +0400 Subject: [PATCH 15/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 40ef2c74ef..eb0d6d22e3 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -571,7 +571,7 @@ ov::genai::LLMPipeline::LLMPipeline( // std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.num_kv_blocks= 64; + scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( path, From 3cb2105c82866165459335a355a94bc29fdb430c Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Wed, 25 Sep 2024 18:33:01 +0400 Subject: [PATCH 16/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index eb0d6d22e3..0732adf61c 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -538,7 +538,7 @@ ov::genai::LLMPipeline::LLMPipeline( // std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.num_kv_blocks = 64; + scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( model_path, From 40ea516e2236528d86e567ea762329be31da77af Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 25 Sep 2024 17:24:06 +0200 Subject: [PATCH 17/57] Limit max new tokens. 
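A GenerationConfig whose max_new_tokens is left at its SIZE_MAX default lets generation run until EOS, which is unbounded for prompts that never produce one; this patch clamps the unset value to a finite default of 100 inside set_generation_config. A minimal sketch of the clamping rule follows; the standalone helper name is illustrative, not code from this repository:

    #include <cstdint>  // SIZE_MAX

    // Illustrative helper mirroring the clamp this patch introduces:
    // an unset max_new_tokens (encoded as SIZE_MAX) is replaced by a
    // finite default so generation cannot run unbounded.
    inline size_t effective_max_new_tokens(size_t requested) {
        constexpr size_t kDefaultMaxNewTokens = 100;
        return requested == SIZE_MAX ? kDefaultMaxNewTokens : requested;
    }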
--- src/cpp/src/llm_pipeline.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 0732adf61c..6f2e2def08 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "openvino/genai/continuous_batching_pipeline.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/llm_pipeline.hpp" @@ -611,6 +612,9 @@ void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& confi if (config.eos_token_id == -1) m_pimpl->m_generation_config.eos_token_id = default_eos_token_id; + if (config.m_generation_config.max_new_tokens == SIZE_MAX) + config.m_generation_config.max_new_tokens = 100; + m_pimpl->m_generation_config.validate(); } From 193df7e705a9eeec7598188d010971d86b31cb5d Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 25 Sep 2024 17:29:23 +0200 Subject: [PATCH 18/57] Fixed error --- src/cpp/src/llm_pipeline.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 6f2e2def08..e3aa795b2c 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -612,8 +612,8 @@ void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& confi if (config.eos_token_id == -1) m_pimpl->m_generation_config.eos_token_id = default_eos_token_id; - if (config.m_generation_config.max_new_tokens == SIZE_MAX) - config.m_generation_config.max_new_tokens = 100; + if (config.max_new_tokens == SIZE_MAX) + m_pimpl->m_generation_config.max_new_tokens = 100; m_pimpl->m_generation_config.validate(); } From 1704548afea68747161e29811bbdec7eecc2c216 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 30 Sep 2024 07:58:14 +0400 Subject: [PATCH 19/57] Clean up --- src/python/py_generate_pipeline.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 0104364891..85ae6b3e67 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include #include @@ -10,7 +11,6 @@ #include "openvino/genai/llm_pipeline.hpp" #include #include "../cpp/src/tokenizers_path.hpp" -#include #include "./utils.hpp" @@ -434,7 +434,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { m.doc() = "Pybind11 binding for LLM Pipeline"; py::class_(m, "LLMPipeline", "This class is used for generation with LLMs") - .def(py::init([&]( + .def(py::init([]( const std::string& model_path, const std::string& device, const std::map& config @@ -442,8 +442,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); return std::make_unique(model_path, device, utils::properties_to_any_map(config)); }), - py::call_guard(), + py::call_guard(), // Respect std::cout flushes from constructor. 
py::arg("model_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", py::arg("device") = "CPU", "device on which inference will be done", py::arg("config") = ov::AnyMap({}), "openvino.properties map", @@ -463,8 +462,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); return std::make_unique(model_path, tokenizer, device, utils::properties_to_any_map(config)); }), - py::call_guard(), + py::call_guard(), // Respect std::cout flushes from constructor. py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", From 086c7b8698ed4901c0f18d67568eaf7570ec04db Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 30 Sep 2024 10:33:47 +0400 Subject: [PATCH 20/57] Default destructors --- src/cpp/src/continuous_batching_pipeline.cpp | 2 -- src/cpp/src/timer.hpp | 2 -- 2 files changed, 4 deletions(-) diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 13f2005acd..2ed14a86ff 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -40,8 +40,6 @@ class ContinuousBatchingPipeline::Impl { float m_paged_attention_time_ms = 0.0f; float m_matmul_time_ms = 0.0f; float m_infer_total_ms = 0.0f; - - ~PerfTime() {} } m_perf; // current requests to process diff --git a/src/cpp/src/timer.hpp b/src/cpp/src/timer.hpp index 03367a5530..15976a54a0 100644 --- a/src/cpp/src/timer.hpp +++ b/src/cpp/src/timer.hpp @@ -25,6 +25,4 @@ class ManualTimer { auto m_end = std::chrono::steady_clock::now(); m_total += std::chrono::duration(m_end - m_start).count(); } - - ~ManualTimer() {} }; From 741c13bf475069db5fa101347ea734af8762e98d Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 30 Sep 2024 11:11:19 +0400 Subject: [PATCH 21/57] Default ~PerfTime --- src/cpp/src/continuous_batching_impl_interface.hpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl_interface.hpp b/src/cpp/src/continuous_batching_impl_interface.hpp index a3615b5828..a9ae021de0 100644 --- a/src/cpp/src/continuous_batching_impl_interface.hpp +++ b/src/cpp/src/continuous_batching_impl_interface.hpp @@ -26,14 +26,6 @@ class ContinuousBatchingPipeline::ImplInterface { float m_paged_attention_time_ms = 0.0f; float m_matmul_time_ms = 0.0f; float m_infer_total_ms = 0.0f; - - ~PerfTime() { - std::cout << "Inference requests aggregated statistic: " << std::endl; - std::cout << "Paged attention % of inference execution: " << (m_paged_attention_time_ms / m_infer_total_ms) * 100 << std::endl; - std::cout << "MatMul % of inference execution: " << (m_matmul_time_ms / m_infer_total_ms) * 100 << std::endl; - std::cout << "Total inference execution secs: " << m_infer_total_ms / 1000. 
<< std::endl; - std::cout << std::endl; - } } m_perf; bool m_is_chat_conversation = false; ChatHistory m_history; From 8d7d39d112f2589fb133a1825eea6d40fe253f9f Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 11 Oct 2024 17:19:40 +0400 Subject: [PATCH 22/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 79d71ce9dc..78f646e031 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -545,6 +545,7 @@ ov::genai::LLMPipeline::LLMPipeline( SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = true; + scheduler_config.max_num_batched_tokens = 2048; m_pimpl = std::make_unique( model_path, tokenizer, From c4e8e05bba9ce698f7582de3a49e9b6877ace69e Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 11 Oct 2024 17:19:46 +0400 Subject: [PATCH 23/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 78f646e031..fbb3aae5c8 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -579,6 +579,7 @@ ov::genai::LLMPipeline::LLMPipeline( SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = true; + scheduler_config.max_num_batched_tokens = 2048; m_pimpl = std::make_unique( path, scheduler_config, From 81163424bb4432d9be3d639defc4d3698c3af075 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 11 Oct 2024 17:32:01 +0400 Subject: [PATCH 24/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index fbb3aae5c8..199b8eaceb 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -546,6 +546,7 @@ ov::genai::LLMPipeline::LLMPipeline( scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = true; scheduler_config.max_num_batched_tokens = 2048; + scheduler_config.dynamic_split_fuse = false; m_pimpl = std::make_unique( model_path, tokenizer, From b87d0f6c56b1d92cfde9c2e41a4057a0911ab875 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 11 Oct 2024 17:32:07 +0400 Subject: [PATCH 25/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 199b8eaceb..af8739157f 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -581,6 +581,7 @@ ov::genai::LLMPipeline::LLMPipeline( scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = true; scheduler_config.max_num_batched_tokens = 2048; + scheduler_config.dynamic_split_fuse = false; m_pimpl = std::make_unique( path, scheduler_config, From 1806fa0de8b0be4f5eca3888c60960778e391026 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Fri, 11 Oct 2024 19:04:33 +0400 Subject: [PATCH 26/57] CB: fix deadlock (#71) --- src/cpp/src/continuous_batching_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 8df6d6a185..4af4593bd0 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -258,7 +258,7 @@ 
ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorcan_read()) { std::unordered_map token = generations.at(0).get()->back(); OPENVINO_ASSERT(1 == token.size()); OPENVINO_ASSERT(1 == token.begin()->second.generated_ids.size()); From 4bbcd0ea8e13c70d6ce8b9ed635b82cc8ada065f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 12 Oct 2024 21:43:52 +0400 Subject: [PATCH 27/57] Increase timeouts for tests --- .github/workflows/causal_lm_cpp.yml | 46 ++++++++++++++--------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index b8fbe397d2..e8dcd73f99 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -57,7 +57,7 @@ jobs: - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 25s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0" + && timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0" | diff <(timeout 25s samples/python/greedy_causal_lm/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") - cpp-beam_search_causal_lm-ubuntu: @@ -100,7 +100,7 @@ jobs: source ./ov/setupvars.sh export PYTHONPATH=./build/:$PYTHONPATH # C++ ignores that - timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt + timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -116,7 +116,7 @@ jobs: " echo "Why is the Sun yellow?" passed - timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt + timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -132,7 +132,7 @@ jobs: " echo 69 passed - timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt + timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -148,7 +148,7 @@ jobs: " echo "Hi" passed - timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt + timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -164,7 +164,7 @@ jobs: " echo "return 0" passed - timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt + timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r', errors='ignore') as file: @@ -180,7 +180,7 @@ jobs: " echo "你好! 你好嗎?" passed - timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt + timeout 2m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r', errors='ignore') as file: @@ -285,7 +285,7 @@ jobs: - run: > . 
./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 2m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 2m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) - + && timeout 4m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 4m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) - cpp-beam_search_causal_lm-Qwen1_5-7B-Chat: runs-on: ubuntu-20.04-16-cores @@ -318,8 +318,8 @@ jobs: - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") - + && timeout 100s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" + | diff <(timeout 100s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") - cpp-beam_search_causal_lm-Phi-2: runs-on: ubuntu-20.04-16-cores @@ -352,8 +352,8 @@ jobs: - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69 - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) - + && timeout 100s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69 + | diff <(timeout 100s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) - cpp-beam_search_causal_lm-notus-7b-v1: runs-on: ubuntu-20.04-16-cores @@ -386,8 +386,8 @@ jobs: - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69 - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) - + && timeout 100s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69 + | diff <(timeout 100s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) - cpp-speculative_decoding_lm-ubuntu: runs-on: ubuntu-20.04-16-cores @@ -536,7 +536,7 @@ jobs: - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt + timeout 100s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " @@ -556,7 +556,7 @@ jobs: - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a" + && timeout 100s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a" | diff ./pred_greedy.txt - cpp-greedy_causal_lm-redpajama-3b-chat: @@ -590,7 +590,7 @@ jobs: - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt + timeout 100s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " @@ -610,7 +610,7 @@ jobs: - run: > . 
./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a" + && timeout 100s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a" | diff ./pred_greedy.txt - cpp-chat_sample-ubuntu: @@ -645,7 +645,7 @@ jobs: run: | source ./ov/setupvars.sh printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\n' > ./input.txt - timeout 30s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt + timeout 60s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt python -c " from transformers import LlamaTokenizer, AutoModelForCausalLM model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' @@ -675,7 +675,7 @@ jobs: diff pred.txt ref.txt echo "Chat sample cpp" passed export PYTHONPATH=./build/:$PYTHONPATH - timeout 30s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt + timeout 60s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt diff pred2.txt ref.txt echo "Chat sample python" passed @@ -708,7 +708,7 @@ jobs: - name: Run visual_language_chat sample - MiniCPM-V-2_6 run: > source ./ov/setupvars.sh - && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg + && timeout 240s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg <<< $'What is on the image?\nWhat is special on the image?' - name: Download and convert LLaVa 1.5 model and an image run: | @@ -720,7 +720,7 @@ jobs: - name: Run visual_language_chat sample - LLaVa 1.5 run: > source ./ov/setupvars.sh - && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_1_5_7b_ov/ monalisa.jpg + && timeout 240s ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_1_5_7b_ov/ monalisa.jpg <<< $'Who drew this painting?\nWhen did the painter live?' 
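      # The chat steps in this job feed prompts over stdin to samples built on
      # the LLMPipeline chat API. For orientation, a condensed sketch of such a
      # sample's core loop (based on the public start_chat/generate/finish_chat
      # API used by samples/cpp/chat_sample; argument checking and streaming
      # omitted):
      #
      #     #include "openvino/genai/llm_pipeline.hpp"
      #     #include <iostream>
      #     #include <string>
      #
      #     int main(int argc, char* argv[]) {
      #         ov::genai::LLMPipeline pipe(argv[1], "CPU");
      #         ov::genai::GenerationConfig config = pipe.get_generation_config();
      #         config.max_new_tokens = 100;
      #         pipe.start_chat();
      #         for (std::string prompt; std::getline(std::cin, prompt);) {
      #             std::cout << pipe.generate(prompt, config) << '\n';
      #         }
      #         pipe.finish_chat();
      #     }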
- name: Run python chat sample @@ -728,7 +728,7 @@ jobs: source ./ov/setupvars.sh export PYTHONPATH=./build/:$PYTHONPATH printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt - timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt + timeout 240s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt cpp-continuous-batching-ubuntu: runs-on: ubuntu-20.04-8-cores From 743e018baa058a14d0353bc186ecae50a116ef0b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 13 Oct 2024 02:17:15 +0400 Subject: [PATCH 28/57] Update causal_lm_cpp.yml --- .github/workflows/causal_lm_cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index e8dcd73f99..0f0192a7a1 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -479,7 +479,7 @@ jobs: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup + assert predicted_greedy == predicted_prompt_lookup, "Expected {predicted_greedy}, actual {predicted_prompt_lookup}" " echo "Prompt lookup" passed - name: run and compare (model with seq_length_axis = 1) From cfccefa1f6db1464bb544cd3ca321f6055b97713 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 13 Oct 2024 02:20:32 +0400 Subject: [PATCH 29/57] Use split_core_complile_config for CB --- src/cpp/src/continuous_batching_impl.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 4af4593bd0..c6163dd540 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -4,6 +4,7 @@ #include "text_callback_streamer.hpp" #include "continuous_batching_impl.hpp" #include "paged_attention_transformations.hpp" +#include "utils.hpp" namespace ov::genai { template struct overloaded : Ts... 
{using Ts::operator()...;}; @@ -18,15 +19,18 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl( m_tokenizer = tokenizer; ov::Core core; + auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config); + core.set_property(core_plugin_config); + // The model can be compiled for GPU as well std::shared_ptr model = core.read_model(models_path + "/openvino_model.xml"); - DeviceConfig device_config(core, scheduler_config, device, plugin_config); + DeviceConfig device_config(core, scheduler_config, device, compile_plugin_config); bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction; apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control); - ov::InferRequest infer_request = core.compile_model(model, device_config.get_device(), plugin_config).create_infer_request(); + ov::InferRequest infer_request = core.compile_model(model, device_config.get_device(), compile_plugin_config).create_infer_request(); // setup KV caches m_cache_manager = std::make_shared(device_config, core); From 03965d6677c59ee8732bef368d635c7a07e0cada Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 13 Oct 2024 02:27:19 +0400 Subject: [PATCH 30/57] Update causal_lm_cpp.yml --- .github/workflows/causal_lm_cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 0f0192a7a1..398a83c23e 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -501,7 +501,7 @@ jobs: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup + assert predicted_greedy == predicted_prompt_lookup, "Expected {predicted_greedy}, actual {predicted_prompt_lookup}" " echo "Prompt lookup" passed From 784c3312dc0a7d7cf2025b9e1a2d6144a9cf4cff Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 13 Oct 2024 05:22:19 +0400 Subject: [PATCH 31/57] Drop request if it's aborted by streamer --- src/cpp/src/continuous_batching_impl.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index c6163dd540..683e22feb4 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -273,6 +273,16 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorend(); } + if (!continue_generation && !m_requests.empty()) { + SequenceGroup::Ptr request = m_requests[0]; + for (const auto& sequence: request->get_sequences()) { + if (m_scheduler->has_block_table(sequence->get_id())) { + m_scheduler->free_sequence(sequence->get_id()); + } + } + m_sampler->clear_beam_search_info(request->get_request_id()); + } + for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { const auto& generation = generations[generation_idx]; EncodedGenerationResult result; From 93b8c38e0f74bf4228da3d310b84c25806dc424b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 13 Oct 2024 08:46:57 +0400 Subject: [PATCH 32/57] Update src/cpp/src/continuous_batching_impl.cpp --- src/cpp/src/continuous_batching_impl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 683e22feb4..1a2a21e649 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ 
b/src/cpp/src/continuous_batching_impl.cpp @@ -281,6 +281,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorclear_beam_search_info(request->get_request_id()); + m_requests.erase(m_requests.begin()); } for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { From 043d842c7f725b7009dceb004d55da9f3d90a3a8 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 13:41:49 +0400 Subject: [PATCH 33/57] Drop request in case of exceptions, etc --- src/cpp/src/continuous_batching_impl.cpp | 31 ++++++++++++++++-------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 1a2a21e649..7df72de2c9 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -259,9 +259,25 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector results; results.reserve(m_awaiting_requests.size()); + auto drop_current_request = [&] () { + SequenceGroup::Ptr request = m_requests[0]; + for (const auto& sequence: request->get_sequences()) { + if (m_scheduler->has_block_table(sequence->get_id())) { + m_scheduler->free_sequence(sequence->get_id()); + } + } + m_sampler->clear_beam_search_info(request->get_request_id()); + m_requests.erase(m_requests.begin()); + }; + bool continue_generation = true; while (has_non_finished_requests() && continue_generation) { - step(); + try { + step(); + } catch (...) { + drop_current_request(); + throw; + } if (streamer_ptr && generations.at(0)->can_read()) { std::unordered_map token = generations.at(0).get()->back(); OPENVINO_ASSERT(1 == token.size()); @@ -273,15 +289,10 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorend(); } - if (!continue_generation && !m_requests.empty()) { - SequenceGroup::Ptr request = m_requests[0]; - for (const auto& sequence: request->get_sequences()) { - if (m_scheduler->has_block_table(sequence->get_id())) { - m_scheduler->free_sequence(sequence->get_id()); - } - } - m_sampler->clear_beam_search_info(request->get_request_id()); - m_requests.erase(m_requests.begin()); + if (!continue_generation) { + drop_current_request(); + } else { + OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed"); } for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { From fdad63cfa81b6531f1bdcdec09ce2ec7c304fc28 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 14:37:10 +0400 Subject: [PATCH 34/57] Turned off prefix caching --- src/cpp/src/llm_pipeline.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 20afe9f5ab..af625beb89 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -547,7 +547,7 @@ ov::genai::LLMPipeline::LLMPipeline( } else if (true) { SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; - scheduler_config.enable_prefix_caching = true; + scheduler_config.enable_prefix_caching = false; scheduler_config.max_num_batched_tokens = 2048; scheduler_config.dynamic_split_fuse = false; m_pimpl = std::make_unique( @@ -582,7 +582,7 @@ ov::genai::LLMPipeline::LLMPipeline( } else if (true) { SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; - scheduler_config.enable_prefix_caching = true; + scheduler_config.enable_prefix_caching = false; 
scheduler_config.max_num_batched_tokens = 2048; scheduler_config.dynamic_split_fuse = false; m_pimpl = std::make_unique( From a21f7255e1a8b281f4b55c62adee08b5e83eff10 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 14:52:00 +0400 Subject: [PATCH 35/57] Apply suggestions from code review --- src/cpp/src/llm_pipeline.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index af625beb89..e870db98a2 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -548,8 +548,7 @@ ov::genai::LLMPipeline::LLMPipeline( SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = false; - scheduler_config.max_num_batched_tokens = 2048; - scheduler_config.dynamic_split_fuse = false; + scheduler_config.dynamic_split_fuse = true; m_pimpl = std::make_unique( model_path, tokenizer, @@ -582,8 +581,7 @@ ov::genai::LLMPipeline::LLMPipeline( } else if (true) { SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; - scheduler_config.enable_prefix_caching = false; - scheduler_config.max_num_batched_tokens = 2048; + scheduler_config.max_num_batched_tokens = 256; scheduler_config.dynamic_split_fuse = false; m_pimpl = std::make_unique( path, From a66be9ef9f10416058783bc72fc645a21c50158d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 14:53:05 +0400 Subject: [PATCH 36/57] Apply suggestions from code review --- src/cpp/src/llm_pipeline.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index e870db98a2..d2f7112e17 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -548,7 +548,6 @@ ov::genai::LLMPipeline::LLMPipeline( SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = false; - scheduler_config.dynamic_split_fuse = true; m_pimpl = std::make_unique( model_path, tokenizer, @@ -581,8 +580,7 @@ ov::genai::LLMPipeline::LLMPipeline( } else if (true) { SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; - scheduler_config.max_num_batched_tokens = 256; - scheduler_config.dynamic_split_fuse = false; + scheduler_config.enable_prefix_caching = false; m_pimpl = std::make_unique( path, scheduler_config, From 82fceb5baaa752afc71b00776b97d5c67d484211 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 15:43:30 +0400 Subject: [PATCH 37/57] Update continuous_batching_impl.cpp --- src/cpp/src/continuous_batching_impl.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 7df72de2c9..981e85b671 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -259,15 +259,16 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector results; results.reserve(m_awaiting_requests.size()); - auto drop_current_request = [&] () { - SequenceGroup::Ptr request = m_requests[0]; - for (const auto& sequence: request->get_sequences()) { - if (m_scheduler->has_block_table(sequence->get_id())) { - m_scheduler->free_sequence(sequence->get_id()); + auto drop_requests = [&] () { + for (const std::shared_ptr request : m_requests) { + for (const auto& sequence: request->get_sequences()) { + if (m_scheduler->has_block_table(sequence->get_id())) { + 
m_scheduler->free_sequence(sequence->get_id()); + } } + m_sampler->clear_beam_search_info(request->get_request_id()); } - m_sampler->clear_beam_search_info(request->get_request_id()); - m_requests.erase(m_requests.begin()); + m_requests.clear(); }; bool continue_generation = true; @@ -275,7 +276,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorcan_read()) { @@ -290,7 +291,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector Date: Mon, 14 Oct 2024 17:31:33 +0400 Subject: [PATCH 38/57] Apply suggestions from code review --- .github/workflows/causal_lm_cpp.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index d411ec2aa7..f578e4d7b2 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -479,7 +479,7 @@ jobs: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup, "Expected {predicted_greedy}, actual {predicted_prompt_lookup}" + assert predicted_greedy == predicted_prompt_lookup, 'Expected {predicted_greedy}, actual {predicted_prompt_lookup}' " echo "Prompt lookup" passed - name: run and compare (model with seq_length_axis = 1) @@ -501,7 +501,7 @@ jobs: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup, "Expected {predicted_greedy}, actual {predicted_prompt_lookup}" + assert predicted_greedy == predicted_prompt_lookup, 'Expected {predicted_greedy}, actual {predicted_prompt_lookup}' " echo "Prompt lookup" passed From 401967830487e22a9a92b8b6b7b6bf39f97e4a52 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 20:50:27 +0400 Subject: [PATCH 39/57] Apply suggestions from code review --- .github/workflows/causal_lm_cpp.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index f578e4d7b2..e4a2a4d184 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -479,7 +479,7 @@ jobs: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup, 'Expected {predicted_greedy}, actual {predicted_prompt_lookup}' + assert predicted_greedy == predicted_prompt_lookup, f'Expected {predicted_greedy}, actual {predicted_prompt_lookup}' " echo "Prompt lookup" passed - name: run and compare (model with seq_length_axis = 1) @@ -501,7 +501,7 @@ jobs: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup, 'Expected {predicted_greedy}, actual {predicted_prompt_lookup}' + assert predicted_greedy == predicted_prompt_lookup, f'Expected {predicted_greedy}, actual {predicted_prompt_lookup}' " echo "Prompt lookup" passed From feae5469a0ea266dc38062ff2edefd7169e2e534 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 22:48:10 +0400 Subject: [PATCH 40/57] Update causal_lm_cpp.yml --- .github/workflows/causal_lm_cpp.yml | 40 ++++++++++++++--------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml 
b/.github/workflows/causal_lm_cpp.yml
index 1ba7f83fea..2735d3380f 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -57,7 +57,7 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0"
+          && timeout 25s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0"
           | diff <(timeout 25s samples/python/greedy_causal_lm/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") -

   cpp-beam_search_causal_lm-ubuntu:
@@ -100,7 +100,7 @@ jobs:
           source ./ov/setupvars.sh
           export PYTHONPATH=./build/:$PYTHONPATH  # C++ ignores that
-          timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt
+          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r') as file:
@@ -116,7 +116,7 @@ jobs:
           "
           echo "Why is the Sun yellow?" passed

-          timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt
+          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r') as file:
@@ -132,7 +132,7 @@ jobs:
           "
           echo 69 passed

-          timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt
+          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r') as file:
@@ -148,7 +148,7 @@ jobs:
           "
           echo "Hi" passed

-          timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt
+          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r') as file:
@@ -164,7 +164,7 @@ jobs:
           "
           echo "return 0" passed

-          timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt
+          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r', errors='ignore') as file:
@@ -180,7 +180,7 @@ jobs:
           "
           echo "你好! 你好嗎?" passed

-          timeout 2m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt
+          timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r', errors='ignore') as file:
@@ -285,7 +285,7 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 4m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 4m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) -
+          && timeout 2m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 4m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) -

   cpp-beam_search_causal_lm-Qwen1_5-7B-Chat:
     runs-on: ubuntu-20.04-16-cores
@@ -318,8 +318,8 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 100s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!"
-          | diff <(timeout 100s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") -
+          && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!"
+          | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") -

   cpp-beam_search_causal_lm-Phi-2:
     runs-on: ubuntu-20.04-16-cores
@@ -352,8 +352,8 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 100s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69
-          | diff <(timeout 100s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) -
+          && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69
+          | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) -

   cpp-beam_search_causal_lm-notus-7b-v1:
     runs-on: ubuntu-20.04-16-cores
@@ -386,8 +386,8 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 100s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69
-          | diff <(timeout 100s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) -
+          && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69
+          | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) -

   cpp-speculative_decoding_lm-ubuntu:
     runs-on: ubuntu-20.04-16-cores
@@ -536,7 +536,7 @@ jobs:
       - name: Run Generation
         run: |
           source ./ov/setupvars.sh
-          timeout 100s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt
+          timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt
       - name: Compare
         run: |
           python -c "
@@ -556,7 +556,7 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 100s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a"
+          && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a"
           | diff ./pred_greedy.txt -

   cpp-greedy_causal_lm-redpajama-3b-chat:
@@ -590,7 +590,7 @@ jobs:
       - name: Run Generation
         run: |
           source ./ov/setupvars.sh
-          timeout 100s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt
+          timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt
       - name: Compare
         run: |
           python -c "
@@ -610,7 +610,7 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 100s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a"
+          && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a"
           | diff ./pred_greedy.txt -

   cpp-chat_sample-ubuntu:
@@ -645,7 +645,7 @@ jobs:
         run: |
           source ./ov/setupvars.sh
           printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\n' > ./input.txt
-          timeout 60s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt
+          timeout 30s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt
           python -c "
           from transformers import LlamaTokenizer, AutoModelForCausalLM
           model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
@@ -675,7 +675,7 @@ jobs:
           diff pred.txt ref.txt
           echo "Chat sample cpp" passed
           export PYTHONPATH=./build/:$PYTHONPATH
-          timeout 60s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt
+          timeout 30s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt
           diff pred2.txt ref.txt
           echo "Chat sample python" passed

From 5bdf7791ad236af3bbf844dd47a086b3372fa91f Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 14 Oct 2024 22:49:42 +0400
Subject: [PATCH 41/57] Apply suggestions from code review

---
 .github/workflows/causal_lm_cpp.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 2735d3380f..14c42fb7bb 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -285,7 +285,7 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 2m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 4m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) -
+          && timeout 2m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 2m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) -

   cpp-beam_search_causal_lm-Qwen1_5-7B-Chat:
     runs-on: ubuntu-20.04-16-cores
@@ -708,7 +708,7 @@ jobs:
       - name: Run visual_language_chat sample - MiniCPM-V-2_6
         run: >
           source ./ov/setupvars.sh
-          && timeout 240s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
+          && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
           <<< $'What is on the image?\nWhat is special on the image?'
       - name: Download and convert LLaVa 1.5 model and an image
         run: |
@@ -729,7 +729,7 @@ jobs:
           source ./ov/setupvars.sh
           export PYTHONPATH=./build/:$PYTHONPATH
           printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt
-          timeout 240s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt
+          timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt

   cpp-continuous-batching-ubuntu:
     runs-on: ubuntu-20.04-8-cores
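A note on the comparison pattern these jobs rely on: each C++ sample's output is piped into diff against the matching Python sample, so halving the timeouts is safe only because greedy decoding is deterministic; at every step the single highest-probability token is chosen, and two correct implementations over the same model must therefore produce identical text within the time budget. A minimal sketch of that selection rule (illustrative only; the function name is not from this repository):

    #include <cstddef>
    #include <vector>

    // Greedy (argmax) token selection: given next-token logits, pick the index
    // of the largest value. No randomness is involved, which is what makes a
    // byte-for-byte diff between two implementations meaningful.
    std::size_t greedy_select(const std::vector<float>& logits) {
        std::size_t best = 0;
        for (std::size_t i = 1; i < logits.size(); ++i) {
            if (logits[i] > logits[best])
                best = i;
        }
        return best;
    }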
From 7827199e5ee92ad46f6eb3c29ac3c2751b841d90 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 21 Oct 2024 21:47:56 +0400
Subject: [PATCH 42/57] Apply suggestions from code review

---
 src/python/py_generate_pipeline.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp
index ea15da09d6..f6a4de4ff4 100644
--- a/src/python/py_generate_pipeline.cpp
+++ b/src/python/py_generate_pipeline.cpp
@@ -2,7 +2,6 @@
 // SPDX-License-Identifier: Apache-2.0

 #include <filesystem>
-#include <pybind11/iostream.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <pybind11/stl_bind.h>
@@ -404,7 +403,6 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
         ScopedVar env_manager(utils::ov_tokenizers_module_path());
         return std::make_unique<LLMPipeline>(model_path, device, utils::properties_to_any_map(config));
     }),
-    py::call_guard<py::scoped_ostream_redirect>(), // Respect std::cout flushes from constructor.
     py::arg("model_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files",
     py::arg("device") = "CPU", "device on which inference will be done",
     py::arg("config") = ov::AnyMap({}), "openvino.properties map",
@@ -424,7 +422,6 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
         ScopedVar env_manager(utils::ov_tokenizers_module_path());
         return std::make_unique<LLMPipeline>(model_path, tokenizer, device, utils::properties_to_any_map(config));
     }),
-    py::call_guard<py::scoped_ostream_redirect>(), // Respect std::cout flushes from constructor.
     py::arg("model_path"),
     py::arg("tokenizer"),
     py::arg("device") = "CPU",
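For context on the lines removed above: py::call_guard<py::scoped_ostream_redirect>() is the stock pybind11 mechanism for routing C++ std::cout output through Python's sys.stdout for the duration of a bound call; once the pipeline constructor no longer prints, the guard has nothing left to redirect. A small, self-contained illustration of the mechanism (the module and function names are invented for this sketch):

    #include <iostream>
    #include <pybind11/iostream.h>   // py::scoped_ostream_redirect
    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    // Prints from C++; without a redirect guard this write goes to the raw
    // process stdout and bypasses Python-level capture (e.g. pytest's).
    void chatty() { std::cout << "hello from C++\n"; }

    PYBIND11_MODULE(example, m) {
        // The guard wraps every call to `chatty`, forwarding std::cout to
        // Python's sys.stdout while the call is in flight.
        m.def("chatty", &chatty, py::call_guard<py::scoped_ostream_redirect>());
    }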
py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", From 467ab86baa49f13da7b2cec5332a202ec604de66 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 22 Oct 2024 11:20:38 +0400 Subject: [PATCH 43/57] Apply suggestions from code review --- src/cpp/src/llm_pipeline.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 2683f8f253..8b22d3a074 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -590,7 +590,7 @@ ov::genai::LLMPipeline::LLMPipeline( if (config.find(ov::genai::scheduler_config.name()) != config.end()) { auto config_without_scheduler_config = config; config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); - auto& scheduler_config = properties.at(ov::genai::scheduler_config.name()).as(); + auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as(); m_pimpl = std::make_unique(models_path, scheduler_config, device, config_without_scheduler_config); } else if (true) { SchedulerConfig scheduler_config; @@ -600,7 +600,7 @@ ov::genai::LLMPipeline::LLMPipeline( models_path, scheduler_config, device, - properties + config ); } else if ("NPU" == device) { m_pimpl = std::make_unique(models_path, device, config); From 9fad1d24e9acf58a9eda3e21b11d71b46d67824a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 11 Nov 2024 12:51:45 +0400 Subject: [PATCH 44/57] Update linux.yml --- .github/workflows/linux.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 0942483a65..42ba010fb5 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -259,7 +259,7 @@ jobs: run: | source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager - python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template + python -m pytest -s -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template env: PYTHONPATH: "./build/:$PYTHONPATH" @@ -267,13 +267,13 @@ jobs: run: | source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install . 
From 9fad1d24e9acf58a9eda3e21b11d71b46d67824a Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 11 Nov 2024 12:51:45 +0400
Subject: [PATCH 44/57] Update linux.yml

---
 .github/workflows/linux.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 0942483a65..42ba010fb5 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -259,7 +259,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+          python -m pytest -s -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"

@@ -267,13 +267,13 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+          python -m pytest -s -v ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

       - run: python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
       - run: python -m pip install -U "optimum<1.23" --no-dependencies
       - run: >
           source ${OV_INSTALL_DIR}/setupvars.sh
-          && python -m pytest ./tests/python_tests/test_vlm_api.py
+          && python -m pytest -s -v ./tests/python_tests/test_vlm_api.py

   genai_python_lib_whisper:
     name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -352,7 +352,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+          python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"

@@ -360,7 +360,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+          python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

   genai_package:
     name: OpenVINO genai extension (install to OpenVINO package)
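On the flags themselves: -v makes pytest list each test case, and -s disables stdout capture, which matters here because diagnostic prints from the native layer would otherwise stay hidden until a failure, and in a hang they would never appear at all. A tiny illustration of the kind of native-side print this exposes (the struct is a stand-in; only the message text comes from this series):

    #include <iostream>

    // Output written by native code during construction of a bound object.
    // pytest's default capture holds it back; running with -s lets it through
    // immediately, which helps when something stalls before any assertion runs.
    struct Backend {
        Backend() { std::cout << "Using continuous batching backend.\n" << std::flush; }
    };

    int main() { Backend b; }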
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager - python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke + python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke env: PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. @@ -308,7 +308,7 @@ jobs: run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install . --verbose - python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke" + python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke" genai_python_lib_vlm: name: OpenVINO genai VLM tests (cmake + wheel) @@ -366,7 +366,7 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv python -m pip install -U "optimum<1.23" --no-dependencies - python -m pytest ./tests/python_tests/test_vlm_api.py + python -m pytest -s -v ./tests/python_tests/test_vlm_api.py env: PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. From ad78839c017305032bf285ea26feac89781f7a1f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 11 Nov 2024 12:53:22 +0400 Subject: [PATCH 46/57] Update mac.yml --- .github/workflows/mac.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 980c689e19..3afa236843 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -225,7 +225,7 @@ jobs: run: | source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager - python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template + python -m pytest -s -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template env: PYTHONPATH: "./build/:$PYTHONPATH" @@ -234,7 +234,7 @@ jobs: source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install . 
From ad78839c017305032bf285ea26feac89781f7a1f Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 11 Nov 2024 12:53:22 +0400
Subject: [PATCH 46/57] Update mac.yml

---
 .github/workflows/mac.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 980c689e19..3afa236843 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -225,7 +225,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+          python -m pytest -s -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"

@@ -234,7 +234,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
           python -c "from openvino_genai import LLMPipeline"
-          python -m pytest ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+          python -m pytest -s -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

   genai_python_lib_whisper:
     name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -289,7 +289,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+          python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"

@@ -298,7 +298,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
           python -c "from openvino_genai import LLMPipeline"
-          python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+          python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

   genai_package:
     name: OpenVINO genai extension (install to OpenVINO package)
From c5201e4c45a1f6d57d69ae3d4d6ddced454c903b Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 11 Nov 2024 16:18:24 +0400
Subject: [PATCH 47/57] Update linux.yml

---
 .github/workflows/linux.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 42ba010fb5..158930b89e 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -259,7 +259,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest -s -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+          python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"

@@ -267,13 +267,13 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -s -v ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+          python -m pytest -v ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

       - run: python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
       - run: python -m pip install -U "optimum<1.23" --no-dependencies
       - run: >
           source ${OV_INSTALL_DIR}/setupvars.sh
-          && python -m pytest -s -v ./tests/python_tests/test_vlm_api.py
+          && python -m pytest -v ./tests/python_tests/test_vlm_api.py

   genai_python_lib_whisper:
     name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -352,7 +352,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+          python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"

@@ -360,7 +360,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+          python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

   genai_package:
     name: OpenVINO genai extension (install to OpenVINO package)

From 4dd053c9fb635fc8f118bece3e597888b580d2d3 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 11 Nov 2024 23:23:48 +0400
Subject: [PATCH 48/57] Update llm_pipeline.cpp

---
 src/cpp/src/llm_pipeline.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 92ade0e7de..8f6da3537a 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -575,7 +575,9 @@ ov::genai::LLMPipeline::LLMPipeline(
         config_without_scheduler_config.erase(ov::genai::scheduler_config.name());
         auto& scheduler_config = properties.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>();
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, config_without_scheduler_config);
-        // std::cout << "Found custom SchedulerConfig.\n
+        // std::cout << "Found custom SchedulerConfig.\n
+    } else if ("NPU" == device) {
+        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
     } else if (true) {
         SchedulerConfig scheduler_config;
         scheduler_config.cache_size = 1;
@@ -587,8 +589,6 @@ ov::genai::LLMPipeline::LLMPipeline(
             device,
             properties
         );
-    } else if ("NPU" == device) {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
     } else {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
     }
@@ -608,6 +608,8 @@ ov::genai::LLMPipeline::LLMPipeline(
         config_without_scheduler_config.erase(ov::genai::scheduler_config.name());
         auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>();
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, config_without_scheduler_config);
+    } else if ("NPU" == device) {
+        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, config);
     } else if (true) {
         SchedulerConfig scheduler_config;
         scheduler_config.cache_size = 1;
@@ -618,8 +620,6 @@ ov::genai::LLMPipeline::LLMPipeline(
             device,
             config
         );
-    } else if ("NPU" == device) {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, config);
     } else {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, config);
     }
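The reorder in this patch is load-bearing: `else if (true)` is a deliberate catch-all that makes continuous batching the default, so any more specific case, here the NPU static pipeline, has to be tested before it or it becomes unreachable. A stripped-down model of the dispatch (the enum and function are placeholders, not the library's types):

    #include <iostream>
    #include <string>

    enum class Backend { ContinuousBatching, Static, Stateful };

    // Order matters: the catch-all default must follow every specific case,
    // otherwise "NPU" would silently receive the continuous batching backend,
    // which it does not support.
    Backend pick_backend(const std::string& device, bool has_scheduler_config) {
        if (has_scheduler_config)
            return Backend::ContinuousBatching;  // explicit user request
        else if (device == "NPU")
            return Backend::Static;              // specific case first
        else if (true)
            return Backend::ContinuousBatching;  // temporary catch-all default
        else
            return Backend::Stateful;            // kept as the eventual fallback
    }

    int main() {
        std::cout << (pick_backend("NPU", false) == Backend::Static) << '\n';  // prints 1
    }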
From 78470603f84b1f37b9b020dd59b04cc2ab4613a9 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 12 Nov 2024 16:45:22 +0400
Subject: [PATCH 49/57] Update llm_pipeline.cpp

---
 src/cpp/src/llm_pipeline.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 64be0f4b89..393458d281 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -7,7 +7,6 @@
 #include <filesystem>
 #include <fstream>
 #include <variant>
-#include <iostream>
 #include "openvino/genai/continuous_batching_pipeline.hpp"
 #include "openvino/genai/generation_config.hpp"
 #include "openvino/genai/llm_pipeline.hpp"
@@ -568,14 +567,12 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& properties
 ){
-    // std::cout << "Using continuous batching backend.\n";
     auto start_time = std::chrono::steady_clock::now();
     if (properties.find(ov::genai::scheduler_config.name()) != properties.end()) {
         auto config_without_scheduler_config = properties;
         config_without_scheduler_config.erase(ov::genai::scheduler_config.name());
         auto& scheduler_config = properties.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>();
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, config_without_scheduler_config);
-        // std::cout << "Found custom SchedulerConfig.\n
     } else if ("NPU" == device) {
         m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
     } else if (true) {
@@ -601,7 +598,6 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& config
 ){
-    // std::cout << "Using continuous batching backend.\n";
     auto start_time = std::chrono::steady_clock::now();
     if (config.find(ov::genai::scheduler_config.name()) != config.end()) {
         auto config_without_scheduler_config = config;
@@ -649,9 +645,6 @@ void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) {
     // if eos_token_id was not provided in config forward from default config
     if (config.eos_token_id == -1)
         m_pimpl->m_generation_config.set_eos_token_id(default_eos_token_id);
-
-    if (config.max_new_tokens == SIZE_MAX)
-        m_pimpl->m_generation_config.max_new_tokens = 100;

     m_pimpl->m_generation_config.validate();
 }

From d11db7ea74782583ad319a6dd58a99e4cbcee124 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 12 Nov 2024 16:45:55 +0400
Subject: [PATCH 50/57] Apply suggestions from code review

---
 src/cpp/src/llm_pipeline.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 393458d281..e9cfde29d8 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -645,7 +645,6 @@ void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) {
     // if eos_token_id was not provided in config forward from default config
     if (config.eos_token_id == -1)
         m_pimpl->m_generation_config.set_eos_token_id(default_eos_token_id);
-
     m_pimpl->m_generation_config.validate();
 }

From 9acf368d374d1deb4885d9facfb6ab69d1bf7b19 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 12 Nov 2024 16:46:19 +0400
Subject: [PATCH 51/57] Update llm_pipeline.cpp

---
 src/cpp/src/llm_pipeline.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index e9cfde29d8..b79403338c 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -645,6 +645,7 @@ void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) {
     // if eos_token_id was not provided in config forward from default config
     if (config.eos_token_id == -1)
         m_pimpl->m_generation_config.set_eos_token_id(default_eos_token_id);
+
     m_pimpl->m_generation_config.validate();
 }
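Patches 49 through 51 settle what set_generation_config is responsible for: forward a default eos_token_id when the caller left it unset, then validate, and nothing more; the silent cap of max_new_tokens at 100 is dropped, since that default belongs to generation itself rather than to a setter's side effect. A condensed sketch of the retained behaviour (simplified types, not the library's declarations):

    #include <cstdint>
    #include <stdexcept>

    struct GenerationConfig {
        int64_t eos_token_id = -1;  // -1 means "not provided by the caller"
        void validate() const {
            if (eos_token_id < 0)
                throw std::invalid_argument("eos_token_id must be set");
        }
    };

    struct Pipeline {
        int64_t default_eos_token_id = 2;  // normally taken from the tokenizer
        GenerationConfig generation_config;

        void set_generation_config(GenerationConfig config) {
            // Forward the default EOS id if the caller did not provide one,
            // then validate; no other field is rewritten behind the caller's back.
            if (config.eos_token_id == -1)
                config.eos_token_id = default_eos_token_id;
            config.validate();
            generation_config = config;
        }
    };

    int main() {
        Pipeline p;
        p.set_generation_config(GenerationConfig{});  // EOS forwarded, validation passes
    }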
From 3c835af27d1324b323da58f7b685c85db4b6f28e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 12 Nov 2024 16:46:48 +0400
Subject: [PATCH 52/57] Update causal_lm_cpp.yml

---
 .github/workflows/causal_lm_cpp.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 4764c6f747..29f1d082f2 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -510,7 +510,7 @@ jobs:
               predicted_greedy = f.readline()
           with open('predictions_prompt_lookup.txt', 'r') as f:
               predicted_prompt_lookup = f.readline()
-          assert predicted_greedy == predicted_prompt_lookup, f'Expected {predicted_greedy}, actual {predicted_prompt_lookup}'
+          assert predicted_greedy == predicted_prompt_lookup
           "
           echo "Prompt lookup" passed
       - name: run and compare (model with seq_length_axis = 1)
@@ -531,7 +531,7 @@ jobs:
               predicted_greedy = f.readline()
           with open('predictions_prompt_lookup.txt', 'r') as f:
               predicted_prompt_lookup = f.readline()
-          assert predicted_greedy == predicted_prompt_lookup, f'Expected {predicted_greedy}, actual {predicted_prompt_lookup}'
+          assert predicted_greedy == predicted_prompt_lookup
           "
           echo "Prompt lookup" passed

From c807011cdbaca089ae472f57f4a870094af83285 Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Fri, 22 Nov 2024 15:45:40 +0400
Subject: [PATCH 53/57] Fix validation

---
 src/cpp/src/llm_pipeline.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index d33c0ae07e..991eb427d5 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -428,7 +428,8 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         tokenizer,
         scheduler_config,
         device,
-        plugin_config} {
+        plugin_config
+    } {
         m_generation_config = m_impl.get_config();
     }

@@ -442,7 +443,8 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         m_tokenizer,
         scheduler_config,
         device,
-        plugin_config} {
+        plugin_config
+    } {
         m_generation_config = m_impl.get_config();
     }

From 70971647a38e1039d3c742ea849775771adf75f6 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 25 Nov 2024 11:06:38 +0400
Subject: [PATCH 54/57] Update linux.yml

---
 .github/workflows/linux.yml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 3c3e0347e7..a0507b45f4 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -255,14 +255,6 @@ jobs:
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
           cmake --build ./build/ --config Release -j

-      - name: Test bindings
-        run: |
-          source ${OV_INSTALL_DIR}/setupvars.sh
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
-        env:
-          PYTHONPATH: "./build/:$PYTHONPATH"
-
       - name: Test bindings (wheel)
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
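Though the "Fix validation" hunks are only brace placement, the class they touch is the centerpiece of the series: ContinuousBatchingAdapter wraps the continuous batching engine behind the common pipeline-impl interface and copies the engine's generation config upward at construction, so later validation sees real defaults. A skeletal rendering of that adapter shape (every type below is a stand-in):

    #include <memory>
    #include <string>

    struct GenerationConfig { /* sampling parameters, EOS id, ... */ };

    // The engine that actually schedules and runs requests.
    struct Engine {
        GenerationConfig get_config() const { return {}; }
        std::string generate(const std::string& prompt) { return prompt + " ..."; }
    };

    // Interface every pipeline implementation satisfies.
    struct PipelineImplBase {
        GenerationConfig m_generation_config;
        virtual std::string generate(const std::string& prompt) = 0;
        virtual ~PipelineImplBase() = default;
    };

    // Adapter: member-initialize the engine, then lift its defaults into the
    // shared config slot so callers and validation see consistent settings.
    struct ContinuousBatchingAdapter final : PipelineImplBase {
        Engine m_impl;
        ContinuousBatchingAdapter() : m_impl{} {
            m_generation_config = m_impl.get_config();
        }
        std::string generate(const std::string& prompt) override {
            return m_impl.generate(prompt);
        }
    };

    int main() {
        std::unique_ptr<PipelineImplBase> p = std::make_unique<ContinuousBatchingAdapter>();
        (void)p->generate("Hello");
    }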
From 3538bbeb17334f672a39c85c9308aef8122345ad Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 25 Nov 2024 11:07:27 +0400
Subject: [PATCH 55/57] Update windows.yml

---
 .github/workflows/windows.yml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 1e4164aa0b..5805695edd 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -232,14 +232,6 @@ jobs:
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
           cmake --build ./build/ --config Release -j

-      - name: Test bindings
-        run: |
-          . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
-        env:
-          PYTHONPATH: "./build/"  # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
-
       - name: Test bindings (wheel)
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"

From a510e77750f6b6222bc4b854201c9e048fa034d5 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 26 Nov 2024 11:49:04 +0400
Subject: [PATCH 56/57] Update linux.yml

---
 .github/workflows/linux.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index a0507b45f4..a01dc10b2a 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -255,6 +255,13 @@ jobs:
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
           cmake --build ./build/ --config Release -j

+      - name: Install tokenizers
+        run: |
+          source ${OV_INSTALL_DIR}/setupvars.sh
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+        env:
+          PYTHONPATH: "./build/:$PYTHONPATH"
+
       - name: Test bindings (wheel)
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh

From eb0b0f41d72ce4ae4f9b572f1ec610cded65173b Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 26 Nov 2024 11:49:25 +0400
Subject: [PATCH 57/57] Update windows.yml

---
 .github/workflows/windows.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 5805695edd..fa195dd04f 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -232,6 +232,13 @@ jobs:
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
           cmake --build ./build/ --config Release -j

+      - name: Install tokenizers
+        run: |
+          . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+        env:
+          PYTHONPATH: "./build/"  # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
+
       - name: Test bindings (wheel)
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
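Taken together, the series makes continuous batching the default LLMPipeline backend without changing the public API, so existing callers pick it up transparently. A minimal usage sketch, assuming the ov::genai public headers and a converted model directory like those used throughout the CI above (the prompt and the 100-token budget are arbitrary choices, not requirements):

    #include <iostream>
    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // The plain two-argument constructor now routes to the continuous
        // batching backend by default; passing ov::genai::scheduler_config in
        // the properties selects a custom SchedulerConfig instead, and "NPU"
        // still receives the static pipeline.
        ov::genai::LLMPipeline pipe("./TinyLlama-1.1B-Chat-v1.0/", "CPU");
        std::cout << pipe.generate("Why is the Sun yellow?", ov::genai::max_new_tokens(100)) << '\n';
    }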