From ec5f305c8bf9a6ee7d43a46e24e759eb20eba05f Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 19 Sep 2024 14:30:58 +0400 Subject: [PATCH 01/57] Use continuous batching by default --- src/cpp/src/llm_pipeline.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 66e2890671..ff79efb75b 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -515,6 +515,7 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, OptionalGenerationConfig generation_config ) { + OPENVINO_THROW("Not supported"); auto start_time = std::chrono::steady_clock::now(); m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer, generation_config); auto stop_time = std::chrono::steady_clock::now(); @@ -527,12 +528,24 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& plugin_config ){ + std::cout << "Using continuous batching backend.\n"; auto start_time = std::chrono::steady_clock::now(); if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end()) { auto config_without_scheduler_config = plugin_config; config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); auto& scheduler_config = plugin_config.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>(); m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_path, tokenizer, scheduler_config, device, config_without_scheduler_config); + std::cout << "Found custom SchedulerConfig.\n"; + } else if (true) { + SchedulerConfig scheduler_config; + scheduler_config.cache_size = 8; + m_pimpl = std::make_unique<ContinuousBatchingAdapter>( + model_path, + tokenizer, + scheduler_config, + device, + plugin_config + ); } else if ("NPU" == device) { m_pimpl = std::make_unique<StaticLLMPipeline>(model_path, tokenizer, device, plugin_config); } else { @@ -547,12 +560,23 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& config ){ + std::cout << "Using continuous batching backend.\n"; auto start_time = std::chrono::steady_clock::now(); if (config.find(ov::genai::scheduler_config.name()) != config.end()) { auto config_without_scheduler_config = config; config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>(); m_pimpl = std::make_unique<ContinuousBatchingAdapter>(path, scheduler_config, device, config_without_scheduler_config); + std::cout << "Found custom SchedulerConfig.\n"; + } else if (true) { + SchedulerConfig scheduler_config; + scheduler_config.cache_size = 8; + m_pimpl = std::make_unique<ContinuousBatchingAdapter>( + path, + scheduler_config, + device, + config + ); } else if ("NPU" == device) { m_pimpl = std::make_unique<StaticLLMPipeline>(path, device, config); } else { From 41d1fe7fd07487ccddd8d39cf6305a83c1511240 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 16:41:54 +0400 Subject: [PATCH 02/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index ff79efb75b..46650e94bb 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -538,7 +538,7 @@ ov::genai::LLMPipeline::LLMPipeline( std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.cache_size = 8; + scheduler_config.num_kv_blocks = 16; m_pimpl = std::make_unique<ContinuousBatchingAdapter>( model_path, tokenizer, From 36150c4c314a19768daff37e059492d95b2f4b07 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep
2024 16:42:01 +0400 Subject: [PATCH 03/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 46650e94bb..d68b2aa250 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -570,7 +570,7 @@ ov::genai::LLMPipeline::LLMPipeline( std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.cache_size = 8; + scheduler_config.num_kv_blocks= 16; m_pimpl = std::make_unique( path, scheduler_config, From 4a4a09e705d98e081cdd510520e01f18045e9698 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 17:32:01 +0400 Subject: [PATCH 04/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index d68b2aa250..d34ee2563b 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -571,6 +571,7 @@ ov::genai::LLMPipeline::LLMPipeline( } else if (true) { SchedulerConfig scheduler_config; scheduler_config.num_kv_blocks= 16; + scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( path, scheduler_config, From 1a58b5e8bb60d21b5c5299fe1c18ca94c3131c74 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 17:32:06 +0400 Subject: [PATCH 05/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index d34ee2563b..616187d501 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -539,6 +539,7 @@ ov::genai::LLMPipeline::LLMPipeline( } else if (true) { SchedulerConfig scheduler_config; scheduler_config.num_kv_blocks = 16; + scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( model_path, tokenizer, From 90d81e654d884fc5221785860d4c04f3e8a9e699 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Fri, 20 Sep 2024 17:31:02 +0400 Subject: [PATCH 06/57] Reorder cout --- src/cpp/src/continuous_batching_pipeline.cpp | 8 +------- src/cpp/src/timer.hpp | 4 +--- src/python/py_generate_pipeline.cpp | 7 ++++++- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 6100a870f3..13f2005acd 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -41,13 +41,7 @@ class ContinuousBatchingPipeline::Impl { float m_matmul_time_ms = 0.0f; float m_infer_total_ms = 0.0f; - ~PerfTime() { - std::cout << "Inference requests aggregated statistic: " << std::endl; - std::cout << "Paged attention % of inference execution: " << (m_paged_attention_time_ms / m_infer_total_ms) * 100 << std::endl; - std::cout << "MatMul % of inference execution: " << (m_matmul_time_ms / m_infer_total_ms) * 100 << std::endl; - std::cout << "Total inference execution secs: " << m_infer_total_ms / 1000. << std::endl; - std::cout << std::endl; - } + ~PerfTime() {} } m_perf; // current requests to process diff --git a/src/cpp/src/timer.hpp b/src/cpp/src/timer.hpp index c4893acd1c..03367a5530 100644 --- a/src/cpp/src/timer.hpp +++ b/src/cpp/src/timer.hpp @@ -26,7 +26,5 @@ class ManualTimer { m_total += std::chrono::duration(m_end - m_start).count(); } - ~ManualTimer() { - std::cout << m_title << ": " << m_total / 1000. 
<< " secs" << std::endl; - } + ~ManualTimer() {} }; diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 058fd2a823..0104364891 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -10,6 +10,7 @@ #include "openvino/genai/llm_pipeline.hpp" #include #include "../cpp/src/tokenizers_path.hpp" +#include #include "./utils.hpp" @@ -433,7 +434,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { m.doc() = "Pybind11 binding for LLM Pipeline"; py::class_(m, "LLMPipeline", "This class is used for generation with LLMs") - .def(py::init([]( + .def(py::init([&]( const std::string& model_path, const std::string& device, const std::map& config @@ -441,6 +442,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); return std::make_unique(model_path, device, utils::properties_to_any_map(config)); }), + py::call_guard(), py::arg("model_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", py::arg("device") = "CPU", "device on which inference will be done", py::arg("config") = ov::AnyMap({}), "openvino.properties map", @@ -460,6 +463,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); return std::make_unique(model_path, tokenizer, device, utils::properties_to_any_map(config)); }), + py::call_guard(), py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", From 6dc43a30afa9c1a960b5b215b4ad35ae8d8cffb7 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 18:48:12 +0400 Subject: [PATCH 07/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 616187d501..09b9ad3ced 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -571,7 +571,7 @@ ov::genai::LLMPipeline::LLMPipeline( std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.num_kv_blocks= 16; + scheduler_config.num_kv_blocks= 32; scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( path, From 03e2f329696c93dfe758dbc021f799d291719596 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 18:48:17 +0400 Subject: [PATCH 08/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 09b9ad3ced..73076ed96c 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -538,7 +538,7 @@ ov::genai::LLMPipeline::LLMPipeline( std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.num_kv_blocks = 16; + scheduler_config.num_kv_blocks = 32; scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( model_path, From e561e937ca884d5b1f80b6cc2a5019cd36a45135 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 19:03:00 +0400 Subject: [PATCH 09/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 73076ed96c..270f1e3f71 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -538,7 +538,7 @@ ov::genai::LLMPipeline::LLMPipeline( std::cout << 
"Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.num_kv_blocks = 32; + scheduler_config.num_kv_blocks = 64; scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( model_path, From 37ea2adebbce22d6ead62f964f883ccbde3045c2 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 20 Sep 2024 19:03:05 +0400 Subject: [PATCH 10/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 270f1e3f71..e2c5fe3724 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -571,7 +571,7 @@ ov::genai::LLMPipeline::LLMPipeline( std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.num_kv_blocks= 32; + scheduler_config.num_kv_blocks= 64; scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( path, From b62aee9ed6c7c3f22a54d49dc8465316f82a17de Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Mon, 23 Sep 2024 11:45:43 +0400 Subject: [PATCH 11/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index e2c5fe3724..3c3aebe9de 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -535,7 +535,7 @@ ov::genai::LLMPipeline::LLMPipeline( config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); auto& scheduler_config = plugin_config.at(ov::genai::scheduler_config.name()).as(); m_pimpl = std::make_unique(model_path, tokenizer, scheduler_config, device, config_without_scheduler_config); - std::cout << "Found custom SchedulerConfig.\n"; + // std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; scheduler_config.num_kv_blocks = 64; From 07505b3c8992e1afa1240ddfc0e8c8b24a8c7cdd Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Mon, 23 Sep 2024 11:45:48 +0400 Subject: [PATCH 12/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 3c3aebe9de..5372f934ba 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -528,7 +528,7 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& plugin_config ){ - std::cout << "Using continuous batching backend.\n"; + // std::cout << "Using continuous batching backend.\n"; auto start_time = std::chrono::steady_clock::now(); if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end()) { auto config_without_scheduler_config = plugin_config; From e07881886ed6595b54b5f49295951fcfd54282c2 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Mon, 23 Sep 2024 11:57:38 +0400 Subject: [PATCH 13/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 5372f934ba..e45e7258f6 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -568,7 +568,7 @@ ov::genai::LLMPipeline::LLMPipeline( config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as(); m_pimpl = 
std::make_unique(path, scheduler_config, device, config_without_scheduler_config); - std::cout << "Found custom SchedulerConfig.\n"; + // std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; scheduler_config.num_kv_blocks= 64; From 001d3a0932d5fc7db94aaa01611ba7c31124e7b3 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Mon, 23 Sep 2024 11:57:45 +0400 Subject: [PATCH 14/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index e45e7258f6..40ef2c74ef 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -561,7 +561,7 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& config ){ - std::cout << "Using continuous batching backend.\n"; + // std::cout << "Using continuous batching backend.\n"; auto start_time = std::chrono::steady_clock::now(); if (config.find(ov::genai::scheduler_config.name()) != config.end()) { auto config_without_scheduler_config = config; From a0a964f483d1a542b27c47269ca1774963fbe145 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Wed, 25 Sep 2024 18:32:54 +0400 Subject: [PATCH 15/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 40ef2c74ef..eb0d6d22e3 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -571,7 +571,7 @@ ov::genai::LLMPipeline::LLMPipeline( // std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.num_kv_blocks= 64; + scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( path, From 3cb2105c82866165459335a355a94bc29fdb430c Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Wed, 25 Sep 2024 18:33:01 +0400 Subject: [PATCH 16/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index eb0d6d22e3..0732adf61c 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -538,7 +538,7 @@ ov::genai::LLMPipeline::LLMPipeline( // std::cout << "Found custom SchedulerConfig.\n"; } else if (true) { SchedulerConfig scheduler_config; - scheduler_config.num_kv_blocks = 64; + scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = true; m_pimpl = std::make_unique( model_path, From 40ea516e2236528d86e567ea762329be31da77af Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 25 Sep 2024 17:24:06 +0200 Subject: [PATCH 17/57] Limit max new tokens. 
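A GenerationConfig whose max_new_tokens is left at its SIZE_MAX default lets generation run until EOS, which is unbounded for prompts that never produce one; this patch clamps the unset value to a finite default of 100 inside set_generation_config. A minimal sketch of the clamping rule follows; the standalone helper name is illustrative, not code from this repository:

    #include <cstdint>  // SIZE_MAX

    // Illustrative helper mirroring the clamp this patch introduces:
    // an unset max_new_tokens (encoded as SIZE_MAX) is replaced by a
    // finite default so generation cannot run unbounded.
    inline size_t effective_max_new_tokens(size_t requested) {
        constexpr size_t kDefaultMaxNewTokens = 100;
        return requested == SIZE_MAX ? kDefaultMaxNewTokens : requested;
    }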
--- src/cpp/src/llm_pipeline.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 0732adf61c..6f2e2def08 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "openvino/genai/continuous_batching_pipeline.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/llm_pipeline.hpp" @@ -611,6 +612,9 @@ void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& confi if (config.eos_token_id == -1) m_pimpl->m_generation_config.eos_token_id = default_eos_token_id; + if (config.m_generation_config.max_new_tokens == SIZE_MAX) + config.m_generation_config.max_new_tokens = 100; + m_pimpl->m_generation_config.validate(); } From 193df7e705a9eeec7598188d010971d86b31cb5d Mon Sep 17 00:00:00 2001 From: Anastasiia Pnevskaia Date: Wed, 25 Sep 2024 17:29:23 +0200 Subject: [PATCH 18/57] Fixed error --- src/cpp/src/llm_pipeline.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 6f2e2def08..e3aa795b2c 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -612,8 +612,8 @@ void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& confi if (config.eos_token_id == -1) m_pimpl->m_generation_config.eos_token_id = default_eos_token_id; - if (config.m_generation_config.max_new_tokens == SIZE_MAX) - config.m_generation_config.max_new_tokens = 100; + if (config.max_new_tokens == SIZE_MAX) + m_pimpl->m_generation_config.max_new_tokens = 100; m_pimpl->m_generation_config.validate(); } From 1704548afea68747161e29811bbdec7eecc2c216 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 30 Sep 2024 07:58:14 +0400 Subject: [PATCH 19/57] Clean up --- src/python/py_generate_pipeline.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 0104364891..85ae6b3e67 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include #include @@ -10,7 +11,6 @@ #include "openvino/genai/llm_pipeline.hpp" #include #include "../cpp/src/tokenizers_path.hpp" -#include #include "./utils.hpp" @@ -434,7 +434,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { m.doc() = "Pybind11 binding for LLM Pipeline"; py::class_(m, "LLMPipeline", "This class is used for generation with LLMs") - .def(py::init([&]( + .def(py::init([]( const std::string& model_path, const std::string& device, const std::map& config @@ -442,8 +442,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); return std::make_unique(model_path, device, utils::properties_to_any_map(config)); }), - py::call_guard(), + py::call_guard(), // Respect std::cout flushes from constructor. 
py::arg("model_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", py::arg("device") = "CPU", "device on which inference will be done", py::arg("config") = ov::AnyMap({}), "openvino.properties map", @@ -463,8 +462,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { ScopedVar env_manager(utils::ov_tokenizers_module_path()); return std::make_unique(model_path, tokenizer, device, utils::properties_to_any_map(config)); }), - py::call_guard(), + py::call_guard(), // Respect std::cout flushes from constructor. py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", From 086c7b8698ed4901c0f18d67568eaf7570ec04db Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 30 Sep 2024 10:33:47 +0400 Subject: [PATCH 20/57] Default destructors --- src/cpp/src/continuous_batching_pipeline.cpp | 2 -- src/cpp/src/timer.hpp | 2 -- 2 files changed, 4 deletions(-) diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp index 13f2005acd..2ed14a86ff 100644 --- a/src/cpp/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -40,8 +40,6 @@ class ContinuousBatchingPipeline::Impl { float m_paged_attention_time_ms = 0.0f; float m_matmul_time_ms = 0.0f; float m_infer_total_ms = 0.0f; - - ~PerfTime() {} } m_perf; // current requests to process diff --git a/src/cpp/src/timer.hpp b/src/cpp/src/timer.hpp index 03367a5530..15976a54a0 100644 --- a/src/cpp/src/timer.hpp +++ b/src/cpp/src/timer.hpp @@ -25,6 +25,4 @@ class ManualTimer { auto m_end = std::chrono::steady_clock::now(); m_total += std::chrono::duration(m_end - m_start).count(); } - - ~ManualTimer() {} }; From 741c13bf475069db5fa101347ea734af8762e98d Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 30 Sep 2024 11:11:19 +0400 Subject: [PATCH 21/57] Default ~PerfTime --- src/cpp/src/continuous_batching_impl_interface.hpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl_interface.hpp b/src/cpp/src/continuous_batching_impl_interface.hpp index a3615b5828..a9ae021de0 100644 --- a/src/cpp/src/continuous_batching_impl_interface.hpp +++ b/src/cpp/src/continuous_batching_impl_interface.hpp @@ -26,14 +26,6 @@ class ContinuousBatchingPipeline::ImplInterface { float m_paged_attention_time_ms = 0.0f; float m_matmul_time_ms = 0.0f; float m_infer_total_ms = 0.0f; - - ~PerfTime() { - std::cout << "Inference requests aggregated statistic: " << std::endl; - std::cout << "Paged attention % of inference execution: " << (m_paged_attention_time_ms / m_infer_total_ms) * 100 << std::endl; - std::cout << "MatMul % of inference execution: " << (m_matmul_time_ms / m_infer_total_ms) * 100 << std::endl; - std::cout << "Total inference execution secs: " << m_infer_total_ms / 1000. 
<< std::endl; - std::cout << std::endl; - } } m_perf; bool m_is_chat_conversation = false; ChatHistory m_history; From 8d7d39d112f2589fb133a1825eea6d40fe253f9f Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 11 Oct 2024 17:19:40 +0400 Subject: [PATCH 22/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 79d71ce9dc..78f646e031 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -545,6 +545,7 @@ ov::genai::LLMPipeline::LLMPipeline( SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = true; + scheduler_config.max_num_batched_tokens = 2048; m_pimpl = std::make_unique( model_path, tokenizer, From c4e8e05bba9ce698f7582de3a49e9b6877ace69e Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 11 Oct 2024 17:19:46 +0400 Subject: [PATCH 23/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 78f646e031..fbb3aae5c8 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -579,6 +579,7 @@ ov::genai::LLMPipeline::LLMPipeline( SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = true; + scheduler_config.max_num_batched_tokens = 2048; m_pimpl = std::make_unique( path, scheduler_config, From 81163424bb4432d9be3d639defc4d3698c3af075 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 11 Oct 2024 17:32:01 +0400 Subject: [PATCH 24/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index fbb3aae5c8..199b8eaceb 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -546,6 +546,7 @@ ov::genai::LLMPipeline::LLMPipeline( scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = true; scheduler_config.max_num_batched_tokens = 2048; + scheduler_config.dynamic_split_fuse = false; m_pimpl = std::make_unique( model_path, tokenizer, From b87d0f6c56b1d92cfde9c2e41a4057a0911ab875 Mon Sep 17 00:00:00 2001 From: Andrei Kochin Date: Fri, 11 Oct 2024 17:32:07 +0400 Subject: [PATCH 25/57] Update src/cpp/src/llm_pipeline.cpp --- src/cpp/src/llm_pipeline.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 199b8eaceb..af8739157f 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -581,6 +581,7 @@ ov::genai::LLMPipeline::LLMPipeline( scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = true; scheduler_config.max_num_batched_tokens = 2048; + scheduler_config.dynamic_split_fuse = false; m_pimpl = std::make_unique( path, scheduler_config, From 1806fa0de8b0be4f5eca3888c60960778e391026 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Fri, 11 Oct 2024 19:04:33 +0400 Subject: [PATCH 26/57] CB: fix deadlock (#71) --- src/cpp/src/continuous_batching_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 8df6d6a185..4af4593bd0 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -258,7 +258,7 @@ 
ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorcan_read()) { std::unordered_map token = generations.at(0).get()->back(); OPENVINO_ASSERT(1 == token.size()); OPENVINO_ASSERT(1 == token.begin()->second.generated_ids.size()); From 4bbcd0ea8e13c70d6ce8b9ed635b82cc8ada065f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sat, 12 Oct 2024 21:43:52 +0400 Subject: [PATCH 27/57] Increase timeouts for tests --- .github/workflows/causal_lm_cpp.yml | 46 ++++++++++++++--------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index b8fbe397d2..e8dcd73f99 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -57,7 +57,7 @@ jobs: - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 25s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0" + && timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0" | diff <(timeout 25s samples/python/greedy_causal_lm/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") - cpp-beam_search_causal_lm-ubuntu: @@ -100,7 +100,7 @@ jobs: source ./ov/setupvars.sh export PYTHONPATH=./build/:$PYTHONPATH # C++ ignores that - timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt + timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -116,7 +116,7 @@ jobs: " echo "Why is the Sun yellow?" passed - timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt + timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -132,7 +132,7 @@ jobs: " echo 69 passed - timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt + timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -148,7 +148,7 @@ jobs: " echo "Hi" passed - timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt + timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: @@ -164,7 +164,7 @@ jobs: " echo "return 0" passed - timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt + timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r', errors='ignore') as file: @@ -180,7 +180,7 @@ jobs: " echo "你好! 你好嗎?" passed - timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt + timeout 2m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r', errors='ignore') as file: @@ -285,7 +285,7 @@ jobs: - run: > . 
./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 2m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 2m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) - + && timeout 4m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 4m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) - cpp-beam_search_causal_lm-Qwen1_5-7B-Chat: runs-on: ubuntu-20.04-16-cores @@ -318,8 +318,8 @@ jobs: - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") - + && timeout 100s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" + | diff <(timeout 100s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") - cpp-beam_search_causal_lm-Phi-2: runs-on: ubuntu-20.04-16-cores @@ -352,8 +352,8 @@ jobs: - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69 - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) - + && timeout 100s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69 + | diff <(timeout 100s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) - cpp-beam_search_causal_lm-notus-7b-v1: runs-on: ubuntu-20.04-16-cores @@ -386,8 +386,8 @@ jobs: - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69 - | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) - + && timeout 100s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69 + | diff <(timeout 100s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) - cpp-speculative_decoding_lm-ubuntu: runs-on: ubuntu-20.04-16-cores @@ -536,7 +536,7 @@ jobs: - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt + timeout 100s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " @@ -556,7 +556,7 @@ jobs: - run: > . ./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a" + && timeout 100s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a" | diff ./pred_greedy.txt - cpp-greedy_causal_lm-redpajama-3b-chat: @@ -590,7 +590,7 @@ jobs: - name: Run Generation run: | source ./ov/setupvars.sh - timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt + timeout 100s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt - name: Compare run: | python -c " @@ -610,7 +610,7 @@ jobs: - run: > . 
./ov/setupvars.sh && export PYTHONPATH=./build/:$PYTHONPATH - && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a" + && timeout 100s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a" | diff ./pred_greedy.txt - cpp-chat_sample-ubuntu: @@ -645,7 +645,7 @@ jobs: run: | source ./ov/setupvars.sh printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\n' > ./input.txt - timeout 30s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt + timeout 60s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt python -c " from transformers import LlamaTokenizer, AutoModelForCausalLM model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' @@ -675,7 +675,7 @@ jobs: diff pred.txt ref.txt echo "Chat sample cpp" passed export PYTHONPATH=./build/:$PYTHONPATH - timeout 30s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt + timeout 60s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt diff pred2.txt ref.txt echo "Chat sample python" passed @@ -708,7 +708,7 @@ jobs: - name: Run visual_language_chat sample - MiniCPM-V-2_6 run: > source ./ov/setupvars.sh - && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg + && timeout 240s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg <<< $'What is on the image?\nWhat is special on the image?' - name: Download and convert LLaVa 1.5 model and an image run: | @@ -720,7 +720,7 @@ jobs: - name: Run visual_language_chat sample - LLaVa 1.5 run: > source ./ov/setupvars.sh - && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_1_5_7b_ov/ monalisa.jpg + && timeout 240s ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_1_5_7b_ov/ monalisa.jpg <<< $'Who drew this painting?\nWhen did the painter live?' 
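      # The chat steps in this job feed prompts over stdin to samples built on
      # the LLMPipeline chat API. For orientation, a condensed sketch of such a
      # sample's core loop (based on the public start_chat/generate/finish_chat
      # API used by samples/cpp/chat_sample; argument checking and streaming
      # omitted):
      #
      #     #include "openvino/genai/llm_pipeline.hpp"
      #     #include <iostream>
      #     #include <string>
      #
      #     int main(int argc, char* argv[]) {
      #         ov::genai::LLMPipeline pipe(argv[1], "CPU");
      #         ov::genai::GenerationConfig config = pipe.get_generation_config();
      #         config.max_new_tokens = 100;
      #         pipe.start_chat();
      #         for (std::string prompt; std::getline(std::cin, prompt);) {
      #             std::cout << pipe.generate(prompt, config) << '\n';
      #         }
      #         pipe.finish_chat();
      #     }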
- name: Run python chat sample @@ -728,7 +728,7 @@ jobs: source ./ov/setupvars.sh export PYTHONPATH=./build/:$PYTHONPATH printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt - timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt + timeout 240s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt cpp-continuous-batching-ubuntu: runs-on: ubuntu-20.04-8-cores From 743e018baa058a14d0353bc186ecae50a116ef0b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 13 Oct 2024 02:17:15 +0400 Subject: [PATCH 28/57] Update causal_lm_cpp.yml --- .github/workflows/causal_lm_cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index e8dcd73f99..0f0192a7a1 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -479,7 +479,7 @@ jobs: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup + assert predicted_greedy == predicted_prompt_lookup, "Expected {predicted_greedy}, actual {predicted_prompt_lookup}" " echo "Prompt lookup" passed - name: run and compare (model with seq_length_axis = 1) From cfccefa1f6db1464bb544cd3ca321f6055b97713 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 13 Oct 2024 02:20:32 +0400 Subject: [PATCH 29/57] Use split_core_complile_config for CB --- src/cpp/src/continuous_batching_impl.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 4af4593bd0..c6163dd540 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -4,6 +4,7 @@ #include "text_callback_streamer.hpp" #include "continuous_batching_impl.hpp" #include "paged_attention_transformations.hpp" +#include "utils.hpp" namespace ov::genai { template struct overloaded : Ts... 
{using Ts::operator()...;}; @@ -18,15 +19,18 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl( m_tokenizer = tokenizer; ov::Core core; + auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config); + core.set_property(core_plugin_config); + // The model can be compiled for GPU as well std::shared_ptr model = core.read_model(models_path + "/openvino_model.xml"); - DeviceConfig device_config(core, scheduler_config, device, plugin_config); + DeviceConfig device_config(core, scheduler_config, device, compile_plugin_config); bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction; apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control); - ov::InferRequest infer_request = core.compile_model(model, device_config.get_device(), plugin_config).create_infer_request(); + ov::InferRequest infer_request = core.compile_model(model, device_config.get_device(), compile_plugin_config).create_infer_request(); // setup KV caches m_cache_manager = std::make_shared(device_config, core); From 03965d6677c59ee8732bef368d635c7a07e0cada Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 13 Oct 2024 02:27:19 +0400 Subject: [PATCH 30/57] Update causal_lm_cpp.yml --- .github/workflows/causal_lm_cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 0f0192a7a1..398a83c23e 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -501,7 +501,7 @@ jobs: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup + assert predicted_greedy == predicted_prompt_lookup, "Expected {predicted_greedy}, actual {predicted_prompt_lookup}" " echo "Prompt lookup" passed From 784c3312dc0a7d7cf2025b9e1a2d6144a9cf4cff Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 13 Oct 2024 05:22:19 +0400 Subject: [PATCH 31/57] Drop request if it's aborted by streamer --- src/cpp/src/continuous_batching_impl.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index c6163dd540..683e22feb4 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -273,6 +273,16 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorend(); } + if (!continue_generation && !m_requests.empty()) { + SequenceGroup::Ptr request = m_requests[0]; + for (const auto& sequence: request->get_sequences()) { + if (m_scheduler->has_block_table(sequence->get_id())) { + m_scheduler->free_sequence(sequence->get_id()); + } + } + m_sampler->clear_beam_search_info(request->get_request_id()); + } + for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { const auto& generation = generations[generation_idx]; EncodedGenerationResult result; From 93b8c38e0f74bf4228da3d310b84c25806dc424b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 13 Oct 2024 08:46:57 +0400 Subject: [PATCH 32/57] Update src/cpp/src/continuous_batching_impl.cpp --- src/cpp/src/continuous_batching_impl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 683e22feb4..1a2a21e649 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ 
b/src/cpp/src/continuous_batching_impl.cpp @@ -281,6 +281,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorclear_beam_search_info(request->get_request_id()); + m_requests.erase(m_requests.begin()); } for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { From 043d842c7f725b7009dceb004d55da9f3d90a3a8 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 13:41:49 +0400 Subject: [PATCH 33/57] Drop request in case of exceptions, etc --- src/cpp/src/continuous_batching_impl.cpp | 31 ++++++++++++++++-------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 1a2a21e649..7df72de2c9 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -259,9 +259,25 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector results; results.reserve(m_awaiting_requests.size()); + auto drop_current_request = [&] () { + SequenceGroup::Ptr request = m_requests[0]; + for (const auto& sequence: request->get_sequences()) { + if (m_scheduler->has_block_table(sequence->get_id())) { + m_scheduler->free_sequence(sequence->get_id()); + } + } + m_sampler->clear_beam_search_info(request->get_request_id()); + m_requests.erase(m_requests.begin()); + }; + bool continue_generation = true; while (has_non_finished_requests() && continue_generation) { - step(); + try { + step(); + } catch (...) { + drop_current_request(); + throw; + } if (streamer_ptr && generations.at(0)->can_read()) { std::unordered_map token = generations.at(0).get()->back(); OPENVINO_ASSERT(1 == token.size()); @@ -273,15 +289,10 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorend(); } - if (!continue_generation && !m_requests.empty()) { - SequenceGroup::Ptr request = m_requests[0]; - for (const auto& sequence: request->get_sequences()) { - if (m_scheduler->has_block_table(sequence->get_id())) { - m_scheduler->free_sequence(sequence->get_id()); - } - } - m_sampler->clear_beam_search_info(request->get_request_id()); - m_requests.erase(m_requests.begin()); + if (!continue_generation) { + drop_current_request(); + } else { + OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed"); } for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { From fdad63cfa81b6531f1bdcdec09ce2ec7c304fc28 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 14:37:10 +0400 Subject: [PATCH 34/57] Turned off prefix caching --- src/cpp/src/llm_pipeline.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 20afe9f5ab..af625beb89 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -547,7 +547,7 @@ ov::genai::LLMPipeline::LLMPipeline( } else if (true) { SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; - scheduler_config.enable_prefix_caching = true; + scheduler_config.enable_prefix_caching = false; scheduler_config.max_num_batched_tokens = 2048; scheduler_config.dynamic_split_fuse = false; m_pimpl = std::make_unique( @@ -582,7 +582,7 @@ ov::genai::LLMPipeline::LLMPipeline( } else if (true) { SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; - scheduler_config.enable_prefix_caching = true; + scheduler_config.enable_prefix_caching = false; 
scheduler_config.max_num_batched_tokens = 2048; scheduler_config.dynamic_split_fuse = false; m_pimpl = std::make_unique( From a21f7255e1a8b281f4b55c62adee08b5e83eff10 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 14:52:00 +0400 Subject: [PATCH 35/57] Apply suggestions from code review --- src/cpp/src/llm_pipeline.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index af625beb89..e870db98a2 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -548,8 +548,7 @@ ov::genai::LLMPipeline::LLMPipeline( SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = false; - scheduler_config.max_num_batched_tokens = 2048; - scheduler_config.dynamic_split_fuse = false; + scheduler_config.dynamic_split_fuse = true; m_pimpl = std::make_unique( model_path, tokenizer, @@ -582,8 +581,7 @@ ov::genai::LLMPipeline::LLMPipeline( } else if (true) { SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; - scheduler_config.enable_prefix_caching = false; - scheduler_config.max_num_batched_tokens = 2048; + scheduler_config.max_num_batched_tokens = 256; scheduler_config.dynamic_split_fuse = false; m_pimpl = std::make_unique( path, From a66be9ef9f10416058783bc72fc645a21c50158d Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 14:53:05 +0400 Subject: [PATCH 36/57] Apply suggestions from code review --- src/cpp/src/llm_pipeline.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index e870db98a2..d2f7112e17 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -548,7 +548,6 @@ ov::genai::LLMPipeline::LLMPipeline( SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; scheduler_config.enable_prefix_caching = false; - scheduler_config.dynamic_split_fuse = true; m_pimpl = std::make_unique( model_path, tokenizer, @@ -581,8 +580,7 @@ ov::genai::LLMPipeline::LLMPipeline( } else if (true) { SchedulerConfig scheduler_config; scheduler_config.cache_size = 1; - scheduler_config.max_num_batched_tokens = 256; - scheduler_config.dynamic_split_fuse = false; + scheduler_config.enable_prefix_caching = false; m_pimpl = std::make_unique( path, scheduler_config, From 82fceb5baaa752afc71b00776b97d5c67d484211 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 15:43:30 +0400 Subject: [PATCH 37/57] Update continuous_batching_impl.cpp --- src/cpp/src/continuous_batching_impl.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 7df72de2c9..981e85b671 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -259,15 +259,16 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector results; results.reserve(m_awaiting_requests.size()); - auto drop_current_request = [&] () { - SequenceGroup::Ptr request = m_requests[0]; - for (const auto& sequence: request->get_sequences()) { - if (m_scheduler->has_block_table(sequence->get_id())) { - m_scheduler->free_sequence(sequence->get_id()); + auto drop_requests = [&] () { + for (const std::shared_ptr request : m_requests) { + for (const auto& sequence: request->get_sequences()) { + if (m_scheduler->has_block_table(sequence->get_id())) { + 
m_scheduler->free_sequence(sequence->get_id()); + } } + m_sampler->clear_beam_search_info(request->get_request_id()); } - m_sampler->clear_beam_search_info(request->get_request_id()); - m_requests.erase(m_requests.begin()); + m_requests.clear(); }; bool continue_generation = true; @@ -275,7 +276,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vectorcan_read()) { @@ -290,7 +291,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector Date: Mon, 14 Oct 2024 17:31:33 +0400 Subject: [PATCH 38/57] Apply suggestions from code review --- .github/workflows/causal_lm_cpp.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index d411ec2aa7..f578e4d7b2 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -479,7 +479,7 @@ jobs: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup, "Expected {predicted_greedy}, actual {predicted_prompt_lookup}" + assert predicted_greedy == predicted_prompt_lookup, 'Expected {predicted_greedy}, actual {predicted_prompt_lookup}' " echo "Prompt lookup" passed - name: run and compare (model with seq_length_axis = 1) @@ -501,7 +501,7 @@ jobs: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup, "Expected {predicted_greedy}, actual {predicted_prompt_lookup}" + assert predicted_greedy == predicted_prompt_lookup, 'Expected {predicted_greedy}, actual {predicted_prompt_lookup}' " echo "Prompt lookup" passed From 401967830487e22a9a92b8b6b7b6bf39f97e4a52 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 20:50:27 +0400 Subject: [PATCH 39/57] Apply suggestions from code review --- .github/workflows/causal_lm_cpp.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index f578e4d7b2..e4a2a4d184 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -479,7 +479,7 @@ jobs: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup, 'Expected {predicted_greedy}, actual {predicted_prompt_lookup}' + assert predicted_greedy == predicted_prompt_lookup, f'Expected {predicted_greedy}, actual {predicted_prompt_lookup}' " echo "Prompt lookup" passed - name: run and compare (model with seq_length_axis = 1) @@ -501,7 +501,7 @@ jobs: predicted_greedy = f.readline() with open('predictions_prompt_lookup.txt', 'r') as f: predicted_prompt_lookup = f.readline() - assert predicted_greedy == predicted_prompt_lookup, 'Expected {predicted_greedy}, actual {predicted_prompt_lookup}' + assert predicted_greedy == predicted_prompt_lookup, f'Expected {predicted_greedy}, actual {predicted_prompt_lookup}' " echo "Prompt lookup" passed From feae5469a0ea266dc38062ff2edefd7169e2e534 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 14 Oct 2024 22:48:10 +0400 Subject: [PATCH 40/57] Update causal_lm_cpp.yml --- .github/workflows/causal_lm_cpp.yml | 40 ++++++++++++++--------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml 
b/.github/workflows/causal_lm_cpp.yml
index 1ba7f83fea..2735d3380f 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -57,7 +57,7 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0"
+          && timeout 25s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0"
           | diff <(timeout 25s samples/python/greedy_causal_lm/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") -

   cpp-beam_search_causal_lm-ubuntu:
@@ -100,7 +100,7 @@ jobs:
           source ./ov/setupvars.sh
           export PYTHONPATH=./build/:$PYTHONPATH  # C++ ignores that
-          timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt
+          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r') as file:
@@ -116,7 +116,7 @@ jobs:
           "
           echo "Why is the Sun yellow?" passed

-          timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt
+          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r') as file:
@@ -132,7 +132,7 @@ jobs:
           "
           echo 69 passed

-          timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt
+          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r') as file:
@@ -148,7 +148,7 @@ jobs:
           "
           echo "Hi" passed

-          timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt
+          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r') as file:
@@ -164,7 +164,7 @@ jobs:
           "
           echo "return 0" passed

-          timeout 50s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt
+          timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r', errors='ignore') as file:
@@ -180,7 +180,7 @@ jobs:
           "
           echo "你好! 你好嗎?" passed

-          timeout 2m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt
+          timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt
           python -c "
           import transformers
           with open('pred.txt', 'r', errors='ignore') as file:
@@ -285,7 +285,7 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 4m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 4m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) -
+          && timeout 2m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 4m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) -

   cpp-beam_search_causal_lm-Qwen1_5-7B-Chat:
     runs-on: ubuntu-20.04-16-cores
@@ -318,8 +318,8 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 100s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!"
-          | diff <(timeout 100s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") -
+          && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!"
+          | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") -

   cpp-beam_search_causal_lm-Phi-2:
     runs-on: ubuntu-20.04-16-cores
@@ -352,8 +352,8 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 100s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69
-          | diff <(timeout 100s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) -
+          && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69
+          | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) -

   cpp-beam_search_causal_lm-notus-7b-v1:
     runs-on: ubuntu-20.04-16-cores
@@ -386,8 +386,8 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 100s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69
-          | diff <(timeout 100s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) -
+          && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69
+          | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) -

   cpp-speculative_decoding_lm-ubuntu:
     runs-on: ubuntu-20.04-16-cores
@@ -536,7 +536,7 @@ jobs:
       - name: Run Generation
         run: |
           source ./ov/setupvars.sh
-          timeout 100s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt
+          timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt
       - name: Compare
         run: |
           python -c "
@@ -556,7 +556,7 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 100s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a"
+          && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a"
           | diff ./pred_greedy.txt -

   cpp-greedy_causal_lm-redpajama-3b-chat:
@@ -590,7 +590,7 @@ jobs:
       - name: Run Generation
         run: |
           source ./ov/setupvars.sh
-          timeout 100s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt
+          timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt
       - name: Compare
         run: |
           python -c "
@@ -610,7 +610,7 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 100s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a"
+          && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a"
           | diff ./pred_greedy.txt -

   cpp-chat_sample-ubuntu:
@@ -645,7 +645,7 @@ jobs:
         run: |
           source ./ov/setupvars.sh
           printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\n' > ./input.txt
-          timeout 60s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt
+          timeout 30s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt
           python -c "
           from transformers import LlamaTokenizer, AutoModelForCausalLM
           model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
@@ -675,7 +675,7 @@ jobs:
           diff pred.txt ref.txt
           echo "Chat sample cpp" passed
           export PYTHONPATH=./build/:$PYTHONPATH
-          timeout 60s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt
+          timeout 30s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt
           diff pred2.txt ref.txt
           echo "Chat sample python" passed

From 5bdf7791ad236af3bbf844dd47a086b3372fa91f Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 14 Oct 2024 22:49:42 +0400
Subject: [PATCH 41/57] Apply suggestions from code review

---
 .github/workflows/causal_lm_cpp.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 2735d3380f..14c42fb7bb 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -285,7 +285,7 @@ jobs:
       - run: >
           . ./ov/setupvars.sh
           && export PYTHONPATH=./build/:$PYTHONPATH
-          && timeout 2m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 4m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) -
+          && timeout 2m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 2m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) -

   cpp-beam_search_causal_lm-Qwen1_5-7B-Chat:
     runs-on: ubuntu-20.04-16-cores
@@ -708,7 +708,7 @@ jobs:
       - name: Run visual_language_chat sample - MiniCPM-V-2_6
         run: >
           source ./ov/setupvars.sh
-          && timeout 240s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
+          && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
           <<< $'What is on the image?\nWhat is special on the image?'
       - name: Download and convert LLaVa 1.5 model and an image
         run: |
@@ -729,7 +729,7 @@ jobs:
           source ./ov/setupvars.sh
           export PYTHONPATH=./build/:$PYTHONPATH
           printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt
-          timeout 240s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt
+          timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt

   cpp-continuous-batching-ubuntu:
     runs-on: ubuntu-20.04-8-cores
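A note on the comparison pattern these jobs rely on: each C++ sample's output is piped into diff against the matching Python sample, so halving the timeouts is safe only because greedy decoding is deterministic; at every step the single highest-probability token is chosen, and two correct implementations over the same model must therefore produce identical text within the time budget. A minimal sketch of that selection rule (illustrative only; the function name is not from this repository):

    #include <cstddef>
    #include <vector>

    // Greedy (argmax) token selection: given next-token logits, pick the index
    // of the largest value. No randomness is involved, which is what makes a
    // byte-for-byte diff between two implementations meaningful.
    std::size_t greedy_select(const std::vector<float>& logits) {
        std::size_t best = 0;
        for (std::size_t i = 1; i < logits.size(); ++i) {
            if (logits[i] > logits[best])
                best = i;
        }
        return best;
    }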
From 7827199e5ee92ad46f6eb3c29ac3c2751b841d90 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 21 Oct 2024 21:47:56 +0400
Subject: [PATCH 42/57] Apply suggestions from code review

---
 src/python/py_generate_pipeline.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp
index ea15da09d6..f6a4de4ff4 100644
--- a/src/python/py_generate_pipeline.cpp
+++ b/src/python/py_generate_pipeline.cpp
@@ -2,7 +2,6 @@
 // SPDX-License-Identifier: Apache-2.0

 #include <filesystem>
-#include <pybind11/iostream.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <pybind11/stl_bind.h>
@@ -404,7 +403,6 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
         ScopedVar env_manager(utils::ov_tokenizers_module_path());
         return std::make_unique<LLMPipeline>(model_path, device, utils::properties_to_any_map(config));
     }),
-    py::call_guard<py::scoped_ostream_redirect>(), // Respect std::cout flushes from constructor.
     py::arg("model_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files",
     py::arg("device") = "CPU", "device on which inference will be done",
     py::arg("config") = ov::AnyMap({}), "openvino.properties map",
@@ -424,7 +422,6 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
         ScopedVar env_manager(utils::ov_tokenizers_module_path());
         return std::make_unique<LLMPipeline>(model_path, tokenizer, device, utils::properties_to_any_map(config));
     }),
-    py::call_guard<py::scoped_ostream_redirect>(), // Respect std::cout flushes from constructor.
     py::arg("model_path"),
     py::arg("tokenizer"),
     py::arg("device") = "CPU",
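For context on the lines removed above: py::call_guard<py::scoped_ostream_redirect>() is the stock pybind11 mechanism for routing C++ std::cout output through Python's sys.stdout for the duration of a bound call; once the pipeline constructor no longer prints, the guard has nothing left to redirect. A small, self-contained illustration of the mechanism (the module and function names are invented for this sketch):

    #include <iostream>
    #include <pybind11/iostream.h>   // py::scoped_ostream_redirect
    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    // Prints from C++; without a redirect guard this write goes to the raw
    // process stdout and bypasses Python-level capture (e.g. pytest's).
    void chatty() { std::cout << "hello from C++\n"; }

    PYBIND11_MODULE(example, m) {
        // The guard wraps every call to `chatty`, forwarding std::cout to
        // Python's sys.stdout while the call is in flight.
        m.def("chatty", &chatty, py::call_guard<py::scoped_ostream_redirect>());
    }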
py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", From 467ab86baa49f13da7b2cec5332a202ec604de66 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 22 Oct 2024 11:20:38 +0400 Subject: [PATCH 43/57] Apply suggestions from code review --- src/cpp/src/llm_pipeline.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 2683f8f253..8b22d3a074 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -590,7 +590,7 @@ ov::genai::LLMPipeline::LLMPipeline( if (config.find(ov::genai::scheduler_config.name()) != config.end()) { auto config_without_scheduler_config = config; config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); - auto& scheduler_config = properties.at(ov::genai::scheduler_config.name()).as(); + auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as(); m_pimpl = std::make_unique(models_path, scheduler_config, device, config_without_scheduler_config); } else if (true) { SchedulerConfig scheduler_config; @@ -600,7 +600,7 @@ ov::genai::LLMPipeline::LLMPipeline( models_path, scheduler_config, device, - properties + config ); } else if ("NPU" == device) { m_pimpl = std::make_unique(models_path, device, config); From 9fad1d24e9acf58a9eda3e21b11d71b46d67824a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 11 Nov 2024 12:51:45 +0400 Subject: [PATCH 44/57] Update linux.yml --- .github/workflows/linux.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 0942483a65..42ba010fb5 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -259,7 +259,7 @@ jobs: run: | source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager - python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template + python -m pytest -s -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template env: PYTHONPATH: "./build/:$PYTHONPATH" @@ -267,13 +267,13 @@ jobs: run: | source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install . 
From 9fad1d24e9acf58a9eda3e21b11d71b46d67824a Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 11 Nov 2024 12:51:45 +0400
Subject: [PATCH 44/57] Update linux.yml

---
 .github/workflows/linux.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 0942483a65..42ba010fb5 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -259,7 +259,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+          python -m pytest -s -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"

@@ -267,13 +267,13 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+          python -m pytest -s -v ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

       - run: python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
       - run: python -m pip install -U "optimum<1.23" --no-dependencies
       - run: >
           source ${OV_INSTALL_DIR}/setupvars.sh
-          && python -m pytest ./tests/python_tests/test_vlm_api.py
+          && python -m pytest -s -v ./tests/python_tests/test_vlm_api.py

   genai_python_lib_whisper:
     name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -352,7 +352,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+          python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"

@@ -360,7 +360,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+          python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

   genai_package:
     name: OpenVINO genai extension (install to OpenVINO package)
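On the flags themselves: -v makes pytest list each test case, and -s disables stdout capture, which matters here because diagnostic prints from the native layer would otherwise stay hidden until a failure, and in a hang they would never appear at all. A tiny illustration of the kind of native-side print this exposes (the struct is a stand-in; only the message text comes from this series):

    #include <iostream>

    // Output written by native code during construction of a bound object.
    // pytest's default capture holds it back; running with -s lets it through
    // immediately, which helps when something stalls before any assertion runs.
    struct Backend {
        Backend() { std::cout << "Using continuous batching backend.\n" << std::flush; }
    };

    int main() { Backend b; }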
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager - python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke + python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke env: PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. @@ -308,7 +308,7 @@ jobs: run: | . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install . --verbose - python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke" + python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke" genai_python_lib_vlm: name: OpenVINO genai VLM tests (cmake + wheel) @@ -366,7 +366,7 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv python -m pip install -U "optimum<1.23" --no-dependencies - python -m pytest ./tests/python_tests/test_vlm_api.py + python -m pytest -s -v ./tests/python_tests/test_vlm_api.py env: PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. From ad78839c017305032bf285ea26feac89781f7a1f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 11 Nov 2024 12:53:22 +0400 Subject: [PATCH 46/57] Update mac.yml --- .github/workflows/mac.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 980c689e19..3afa236843 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -225,7 +225,7 @@ jobs: run: | source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager - python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template + python -m pytest -s -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template env: PYTHONPATH: "./build/:$PYTHONPATH" @@ -234,7 +234,7 @@ jobs: source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install . 
From ad78839c017305032bf285ea26feac89781f7a1f Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 11 Nov 2024 12:53:22 +0400
Subject: [PATCH 46/57] Update mac.yml

---
 .github/workflows/mac.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 980c689e19..3afa236843 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -225,7 +225,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+          python -m pytest -s -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"

@@ -234,7 +234,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
           python -c "from openvino_genai import LLMPipeline"
-          python -m pytest ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+          python -m pytest -s -v ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

   genai_python_lib_whisper:
     name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -289,7 +289,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+          python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"

@@ -298,7 +298,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
           python -c "from openvino_genai import LLMPipeline"
-          python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+          python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

   genai_package:
     name: OpenVINO genai extension (install to OpenVINO package)
From c5201e4c45a1f6d57d69ae3d4d6ddced454c903b Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 11 Nov 2024 16:18:24 +0400
Subject: [PATCH 47/57] Update linux.yml

---
 .github/workflows/linux.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 42ba010fb5..158930b89e 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -259,7 +259,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest -s -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
+          python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"

@@ -267,13 +267,13 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -s -v ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"
+          python -m pytest -v ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py -k "not test_set_chat_template"

       - run: python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
       - run: python -m pip install -U "optimum<1.23" --no-dependencies
       - run: >
           source ${OV_INSTALL_DIR}/setupvars.sh
-          && python -m pytest -s -v ./tests/python_tests/test_vlm_api.py
+          && python -m pytest -v ./tests/python_tests/test_vlm_api.py

   genai_python_lib_whisper:
     name: OpenVINO genai extension whisper tests (cmake + wheel)
@@ -352,7 +352,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
+          python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"

@@ -360,7 +360,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest -s -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"
+          python -m pytest -v ./tests/python_tests/test_whisper_generate_api.py -k "not test_smoke"

   genai_package:
     name: OpenVINO genai extension (install to OpenVINO package)

From 4dd053c9fb635fc8f118bece3e597888b580d2d3 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 11 Nov 2024 23:23:48 +0400
Subject: [PATCH 48/57] Update llm_pipeline.cpp

---
 src/cpp/src/llm_pipeline.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 92ade0e7de..8f6da3537a 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -575,7 +575,9 @@ ov::genai::LLMPipeline::LLMPipeline(
         config_without_scheduler_config.erase(ov::genai::scheduler_config.name());
         auto& scheduler_config = properties.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>();
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, config_without_scheduler_config);
-        // std::cout << "Found custom SchedulerConfig.\n
+        // std::cout << "Found custom SchedulerConfig.\n
+    } else if ("NPU" == device) {
+        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
     } else if (true) {
         SchedulerConfig scheduler_config;
         scheduler_config.cache_size = 1;
@@ -587,8 +589,6 @@ ov::genai::LLMPipeline::LLMPipeline(
             device,
             properties
         );
-    } else if ("NPU" == device) {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
     } else {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, tokenizer, device, properties);
     }
@@ -608,6 +608,8 @@ ov::genai::LLMPipeline::LLMPipeline(
         config_without_scheduler_config.erase(ov::genai::scheduler_config.name());
         auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>();
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, scheduler_config, device, config_without_scheduler_config);
+    } else if ("NPU" == device) {
+        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, config);
     } else if (true) {
         SchedulerConfig scheduler_config;
         scheduler_config.cache_size = 1;
@@ -618,8 +620,6 @@ ov::genai::LLMPipeline::LLMPipeline(
             device,
             config
         );
-    } else if ("NPU" == device) {
-        m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, device, config);
     } else {
         m_pimpl = std::make_unique<StatefulLLMPipeline>(models_path, device, config);
     }
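The reorder in this patch is load-bearing: `else if (true)` is a deliberate catch-all that makes continuous batching the default, so any more specific case, here the NPU static pipeline, has to be tested before it or it becomes unreachable. A stripped-down model of the dispatch (the enum and function are placeholders, not the library's types):

    #include <iostream>
    #include <string>

    enum class Backend { ContinuousBatching, Static, Stateful };

    // Order matters: the catch-all default must follow every specific case,
    // otherwise "NPU" would silently receive the continuous batching backend,
    // which it does not support.
    Backend pick_backend(const std::string& device, bool has_scheduler_config) {
        if (has_scheduler_config)
            return Backend::ContinuousBatching;  // explicit user request
        else if (device == "NPU")
            return Backend::Static;              // specific case first
        else if (true)
            return Backend::ContinuousBatching;  // temporary catch-all default
        else
            return Backend::Stateful;            // kept as the eventual fallback
    }

    int main() {
        std::cout << (pick_backend("NPU", false) == Backend::Static) << '\n';  // prints 1
    }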
From 78470603f84b1f37b9b020dd59b04cc2ab4613a9 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 12 Nov 2024 16:45:22 +0400
Subject: [PATCH 49/57] Update llm_pipeline.cpp

---
 src/cpp/src/llm_pipeline.cpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 64be0f4b89..393458d281 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -7,7 +7,6 @@
 #include <filesystem>
 #include <fstream>
 #include <variant>
-#include <iostream>
 #include "openvino/genai/continuous_batching_pipeline.hpp"
 #include "openvino/genai/generation_config.hpp"
 #include "openvino/genai/llm_pipeline.hpp"
@@ -568,14 +567,12 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& properties
 ){
-    // std::cout << "Using continuous batching backend.\n";
     auto start_time = std::chrono::steady_clock::now();
     if (properties.find(ov::genai::scheduler_config.name()) != properties.end()) {
         auto config_without_scheduler_config = properties;
         config_without_scheduler_config.erase(ov::genai::scheduler_config.name());
         auto& scheduler_config = properties.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>();
         m_pimpl = std::make_unique<ContinuousBatchingAdapter>(models_path, tokenizer, scheduler_config, device, config_without_scheduler_config);
-        // std::cout << "Found custom SchedulerConfig.\n
     } else if ("NPU" == device) {
         m_pimpl = std::make_unique<StaticLLMPipeline>(models_path, tokenizer, device, properties);
     } else if (true) {
@@ -601,7 +598,6 @@ ov::genai::LLMPipeline::LLMPipeline(
     const std::string& device,
     const ov::AnyMap& config
 ){
-    // std::cout << "Using continuous batching backend.\n";
     auto start_time = std::chrono::steady_clock::now();
     if (config.find(ov::genai::scheduler_config.name()) != config.end()) {
         auto config_without_scheduler_config = config;
@@ -649,9 +645,6 @@ void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) {
     // if eos_token_id was not provided in config forward from default config
     if (config.eos_token_id == -1)
         m_pimpl->m_generation_config.set_eos_token_id(default_eos_token_id);
-
-    if (config.max_new_tokens == SIZE_MAX)
-        m_pimpl->m_generation_config.max_new_tokens = 100;

     m_pimpl->m_generation_config.validate();
 }

From d11db7ea74782583ad319a6dd58a99e4cbcee124 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 12 Nov 2024 16:45:55 +0400
Subject: [PATCH 50/57] Apply suggestions from code review

---
 src/cpp/src/llm_pipeline.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 393458d281..e9cfde29d8 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -645,7 +645,6 @@ void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) {
     // if eos_token_id was not provided in config forward from default config
     if (config.eos_token_id == -1)
         m_pimpl->m_generation_config.set_eos_token_id(default_eos_token_id);
-
     m_pimpl->m_generation_config.validate();
 }

From 9acf368d374d1deb4885d9facfb6ab69d1bf7b19 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 12 Nov 2024 16:46:19 +0400
Subject: [PATCH 51/57] Update llm_pipeline.cpp

---
 src/cpp/src/llm_pipeline.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index e9cfde29d8..b79403338c 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -645,6 +645,7 @@ void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) {
     // if eos_token_id was not provided in config forward from default config
     if (config.eos_token_id == -1)
         m_pimpl->m_generation_config.set_eos_token_id(default_eos_token_id);
+
     m_pimpl->m_generation_config.validate();
 }
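Patches 49 through 51 settle what set_generation_config is responsible for: forward a default eos_token_id when the caller left it unset, then validate, and nothing more; the silent cap of max_new_tokens at 100 is dropped, since that default belongs to generation itself rather than to a setter's side effect. A condensed sketch of the retained behaviour (simplified types, not the library's declarations):

    #include <cstdint>
    #include <stdexcept>

    struct GenerationConfig {
        int64_t eos_token_id = -1;  // -1 means "not provided by the caller"
        void validate() const {
            if (eos_token_id < 0)
                throw std::invalid_argument("eos_token_id must be set");
        }
    };

    struct Pipeline {
        int64_t default_eos_token_id = 2;  // normally taken from the tokenizer
        GenerationConfig generation_config;

        void set_generation_config(GenerationConfig config) {
            // Forward the default EOS id if the caller did not provide one,
            // then validate; no other field is rewritten behind the caller's back.
            if (config.eos_token_id == -1)
                config.eos_token_id = default_eos_token_id;
            config.validate();
            generation_config = config;
        }
    };

    int main() {
        Pipeline p;
        p.set_generation_config(GenerationConfig{});  // EOS forwarded, validation passes
    }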
From 3c835af27d1324b323da58f7b685c85db4b6f28e Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 12 Nov 2024 16:46:48 +0400
Subject: [PATCH 52/57] Update causal_lm_cpp.yml

---
 .github/workflows/causal_lm_cpp.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 4764c6f747..29f1d082f2 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -510,7 +510,7 @@ jobs:
               predicted_greedy = f.readline()
           with open('predictions_prompt_lookup.txt', 'r') as f:
               predicted_prompt_lookup = f.readline()
-          assert predicted_greedy == predicted_prompt_lookup, f'Expected {predicted_greedy}, actual {predicted_prompt_lookup}'
+          assert predicted_greedy == predicted_prompt_lookup
           "
           echo "Prompt lookup" passed
       - name: run and compare (model with seq_length_axis = 1)
@@ -531,7 +531,7 @@ jobs:
               predicted_greedy = f.readline()
           with open('predictions_prompt_lookup.txt', 'r') as f:
               predicted_prompt_lookup = f.readline()
-          assert predicted_greedy == predicted_prompt_lookup, f'Expected {predicted_greedy}, actual {predicted_prompt_lookup}'
+          assert predicted_greedy == predicted_prompt_lookup
           "
           echo "Prompt lookup" passed

From c807011cdbaca089ae472f57f4a870094af83285 Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Fri, 22 Nov 2024 15:45:40 +0400
Subject: [PATCH 53/57] Fix validation

---
 src/cpp/src/llm_pipeline.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index d33c0ae07e..991eb427d5 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -428,7 +428,8 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         tokenizer,
         scheduler_config,
         device,
-        plugin_config} {
+        plugin_config
+    } {
         m_generation_config = m_impl.get_config();
     }

@@ -442,7 +443,8 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase {
         m_tokenizer,
         scheduler_config,
         device,
-        plugin_config} {
+        plugin_config
+    } {
         m_generation_config = m_impl.get_config();
     }

From 70971647a38e1039d3c742ea849775771adf75f6 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 25 Nov 2024 11:06:38 +0400
Subject: [PATCH 54/57] Update linux.yml

---
 .github/workflows/linux.yml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 3c3e0347e7..a0507b45f4 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -255,14 +255,6 @@ jobs:
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
           cmake --build ./build/ --config Release -j

-      - name: Test bindings
-        run: |
-          source ${OV_INSTALL_DIR}/setupvars.sh
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
-        env:
-          PYTHONPATH: "./build/:$PYTHONPATH"
-
       - name: Test bindings (wheel)
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
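Though the "Fix validation" hunks are only brace placement, the class they touch is the centerpiece of the series: ContinuousBatchingAdapter wraps the continuous batching engine behind the common pipeline-impl interface and copies the engine's generation config upward at construction, so later validation sees real defaults. A skeletal rendering of that adapter shape (every type below is a stand-in):

    #include <memory>
    #include <string>

    struct GenerationConfig { /* sampling parameters, EOS id, ... */ };

    // The engine that actually schedules and runs requests.
    struct Engine {
        GenerationConfig get_config() const { return {}; }
        std::string generate(const std::string& prompt) { return prompt + " ..."; }
    };

    // Interface every pipeline implementation satisfies.
    struct PipelineImplBase {
        GenerationConfig m_generation_config;
        virtual std::string generate(const std::string& prompt) = 0;
        virtual ~PipelineImplBase() = default;
    };

    // Adapter: member-initialize the engine, then lift its defaults into the
    // shared config slot so callers and validation see consistent settings.
    struct ContinuousBatchingAdapter final : PipelineImplBase {
        Engine m_impl;
        ContinuousBatchingAdapter() : m_impl{} {
            m_generation_config = m_impl.get_config();
        }
        std::string generate(const std::string& prompt) override {
            return m_impl.generate(prompt);
        }
    };

    int main() {
        std::unique_ptr<PipelineImplBase> p = std::make_unique<ContinuousBatchingAdapter>();
        (void)p->generate("Hello");
    }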
From 3538bbeb17334f672a39c85c9308aef8122345ad Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 25 Nov 2024 11:07:27 +0400
Subject: [PATCH 55/57] Update windows.yml

---
 .github/workflows/windows.yml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 1e4164aa0b..5805695edd 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -232,14 +232,6 @@ jobs:
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
           cmake --build ./build/ --config Release -j

-      - name: Test bindings
-        run: |
-          . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-          python -m pytest -v ./tests/python_tests/test_chat_generate_api.py::test_set_chat_template
-        env:
-          PYTHONPATH: "./build/"  # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
-
       - name: Test bindings (wheel)
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"

From a510e77750f6b6222bc4b854201c9e048fa034d5 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 26 Nov 2024 11:49:04 +0400
Subject: [PATCH 56/57] Update linux.yml

---
 .github/workflows/linux.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index a0507b45f4..a01dc10b2a 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -255,6 +255,13 @@ jobs:
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
           cmake --build ./build/ --config Release -j

+      - name: Install tokenizers
+        run: |
+          source ${OV_INSTALL_DIR}/setupvars.sh
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+        env:
+          PYTHONPATH: "./build/:$PYTHONPATH"
+
       - name: Test bindings (wheel)
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh

From eb0b0f41d72ce4ae4f9b572f1ec610cded65173b Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Tue, 26 Nov 2024 11:49:25 +0400
Subject: [PATCH 57/57] Update windows.yml

---
 .github/workflows/windows.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 5805695edd..fa195dd04f 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -232,6 +232,13 @@ jobs:
           cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
           cmake --build ./build/ --config Release -j

+      - name: Install tokenizers
+        run: |
+          . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+        env:
+          PYTHONPATH: "./build/"  # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
+
       - name: Test bindings (wheel)
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
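Taken together, the series makes continuous batching the default LLMPipeline backend without changing the public API, so existing callers pick it up transparently. A minimal usage sketch, assuming the ov::genai public headers and a converted model directory like those used throughout the CI above (the prompt and the 100-token budget are arbitrary choices, not requirements):

    #include <iostream>
    #include "openvino/genai/llm_pipeline.hpp"

    int main() {
        // The plain two-argument constructor now routes to the continuous
        // batching backend by default; passing ov::genai::scheduler_config in
        // the properties selects a custom SchedulerConfig instead, and "NPU"
        // still receives the static pipeline.
        ov::genai::LLMPipeline pipe("./TinyLlama-1.1B-Chat-v1.0/", "CPU");
        std::cout << pipe.generate("Why is the Sun yellow?", ov::genai::max_new_tokens(100)) << '\n';
    }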