+
Llama 2 |
diff --git a/tests/python_tests/README.md b/tests/python_tests/README.md
new file mode 100644
index 0000000000..e5381708de
--- /dev/null
+++ b/tests/python_tests/README.md
@@ -0,0 +1,47 @@
+# OpenVINO™ GenAI Tests
+
+These tests aim to validate support for the vanilla and continuous batching GenAI APIs.
+
+## Set up environment
+
+To run the tests, first build or install the OpenVINO GenAI library by following the instructions in the [GenAI Library README](../../src/README.md).
+
+Then install the test requirements:
+```sh
+pip install -r tests/python_tests/requirements.txt
+```
+
+## Run Tests
+
+```sh
+python -m pytest tests/python_tests/ -m precommit
+```
+
+During the test run, downloaded HuggingFace (HF) models are saved into the current directory. If you wish to place them somewhere else, you can specify the `GENAI_MODELS_PATH_PREFIX` environment variable, e.g.
+```sh
+GENAI_MODELS_PATH_PREFIX=$HOME/test_models python -m pytest tests/python_tests/ -m precommit
+```
+
+If you have built the GenAI library yourself instead of using the wheel, please set `PYTHONPATH` so that the tests can find the library, e.g.
+```sh
+PYTHONPATH=$PYTHONPATH:.../openvino.genai/build-Release/ python -m pytest tests/python_tests/ -m precommit
+```
+
+## Customise test runs
+
+Tests use `precommit` and `nightly` sets of models. `precommit` contains lightweight models which can be inferred quickly, while `nightly` models are heavier and require more time for inference. If you wish to run specific tests only for the nightly models, you can use the `-k` option, for example to run only the multibatch and chat tests:
+```sh
+python -m pytest tests/python_tests/ -m nightly -k "test_multibatch and test_chat"
+```
+
+If you wish to run all tests except beam search, do the following:
+```sh
+python -m pytest tests/python_tests/ -m precommit -k "not test_beam_search"
+```
+
+The `--model_ids` argument can be used to run tests selectively for specific models only. HF model ids should be separated by spaces, e.g.:
+```sh
+python -m pytest tests/python_tests/ -m nightly -k "test_multibatch" --model_ids "TinyLlama/TinyLlama-1.1B-Chat-v1.0 Qwen/Qwen2-0.5B-Instruct"
+```
+
+The list of currently supported `nightly` and `precommit` models can be found in `tests/python_tests/ov_genai_test_utils.py:get_models_list`.
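
The resolved model set depends on the selected marker and the optional `--model_ids` filter. A minimal sketch of inspecting that resolution outside of pytest (assuming it is run from `tests/python_tests/`; `get_models_list()` relies on attributes that `conftest.py` normally sets during `pytest_configure`):

```python
# Hypothetical standalone check of which models the suite would use.
import pytest

pytest.run_marker = "precommit"      # set to "nightly" for the heavier model set
pytest.selected_model_ids = None     # or e.g. "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

from ov_genai_test_utils import get_models_list

for model_id, model_path in get_models_list():
    print(model_id, "->", model_path)
```
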
diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py
index 66212468af..f98f47ecf3 100644
--- a/tests/python_tests/conftest.py
+++ b/tests/python_tests/conftest.py
@@ -14,6 +14,11 @@ def pytest_make_parametrize_id(config, val, argname):
return f'{argname}={val}'
return None
-def pytest_configure(config):
+def pytest_addoption(parser):
+ parser.addoption("--model_ids", help="Select models to run")
+
+def pytest_configure(config: pytest.Config):
marker = 'precommit' if config.getoption('-m') == 'precommit' else 'nightly'
pytest.run_marker = marker
+ pytest.selected_model_ids = config.getoption('--model_ids', default=None)
+
diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index 7bceb29458..7560486d42 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -49,7 +49,10 @@ def get_models_list():
model_ids = precommit_models
else:
model_ids = nightly_models
-
+
+ if pytest.selected_model_ids:
+ model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')]
+ # pytest.set_trace()
prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', ''))
return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids]
diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py
index 94de8f6cc2..5a73d481d3 100644
--- a/tests/python_tests/test_chat_generate_api.py
+++ b/tests/python_tests/test_chat_generate_api.py
@@ -33,6 +33,7 @@
@pytest.mark.parametrize("generation_config", configs)
@pytest.mark.parametrize("model_descr", get_chat_models_list())
@pytest.mark.precommit
+@pytest.mark.nightly
def test_chat_compare_with_HF(model_descr, generation_config: Dict):
device = 'CPU'
chat_history_hf = []
@@ -69,6 +70,7 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict):
@pytest.mark.parametrize("generation_config", configs)
@pytest.mark.parametrize("model_descr", get_chat_models_list())
@pytest.mark.precommit
+@pytest.mark.nightly
def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict):
# compares with HF when history in ov_genai is save as a text
device = 'CPU'
@@ -104,6 +106,7 @@ def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict)
@pytest.mark.parametrize("generation_config", configs)
@pytest.mark.parametrize("model_descr", get_chat_models_list())
@pytest.mark.precommit
+@pytest.mark.nightly
def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict):
# Check that when history is stored in KV cache results are the same as when history stored in a text.
device ='CPU'
@@ -144,6 +147,7 @@ def test_chat_compare_statefull_vs_text_history(model_descr, generation_config:
{'role': 'user', 'content': 'What was my first question?'},
]
@pytest.mark.precommit
+@pytest.mark.nightly
@pytest.mark.parametrize('chat_config', get_chat_templates())
def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]):
tokenizer_config = chat_config[1]
diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py
index 40bc121293..b4e275eef2 100644
--- a/tests/python_tests/test_generate_api.py
+++ b/tests/python_tests/test_generate_api.py
@@ -151,6 +151,7 @@ def hf_ov_genai_tensors_comparison(
@pytest.mark.parametrize("generation_config,prompt", test_cases)
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.precommit
+@pytest.mark.nightly
def test_decoding(model_descr, generation_config, prompt):
run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt)
@@ -168,6 +169,7 @@ def test_decoding(model_descr, generation_config, prompt):
condition=sys.platform in ["linux", "win32"]
)
@pytest.mark.precommit
+@pytest.mark.nightly
def test_ov_tensors(model_descr, inputs):
hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs)
@@ -182,6 +184,7 @@ def test_ov_tensors(model_descr, inputs):
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.parametrize("prompt", prompts)
@pytest.mark.precommit
+@pytest.mark.nightly
@pytest.mark.xfail(
raises=TypeError,
reason="pybind was unable to find ov::Tensor from openvino yet",
@@ -217,6 +220,7 @@ def test_genai_tokenizer_encode(model_descr, prompt):
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.parametrize("encoded_prompt", encoded_prompts)
@pytest.mark.precommit
+@pytest.mark.nightly
@pytest.mark.xfail(
raises=TypeError,
reason="pybind was unable to find ov::Tensor from openvino yet",
@@ -252,6 +256,7 @@ def test_genai_tokenizer_decode(model_descr, encoded_prompt):
@pytest.mark.parametrize("prompts", batched_prompts)
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.precommit
+@pytest.mark.nightly
def test_multibatch(model_descr, generation_config, prompts):
run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts)
@@ -264,6 +269,7 @@ def test_multibatch(model_descr, generation_config, prompts):
@pytest.mark.parametrize("prompt", prompts)
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.precommit
+@pytest.mark.nightly
def test_beam_search_decoding(model_descr, num_beam_groups, group_size,
max_new_tokens, diversity_penalty, prompt):
generation_config = dict(
@@ -281,6 +287,7 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size,
@pytest.mark.parametrize("max_new_tokens", [10, 80])
@pytest.mark.parametrize("model_descr", get_models_list())
@pytest.mark.precommit
+@pytest.mark.nightly
def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens):
# todo: with EARLY stop_criteria looks like HF return unvalid out with sentence
# while genai ends sentence with
@@ -323,6 +330,7 @@ def user_defined_callback(subword):
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
+@pytest.mark.nightly
def test_callback_one_string(callback):
pipe = read_model(get_models_list()[0])[4]
generation_config = pipe.get_generation_config()
@@ -332,6 +340,7 @@ def test_callback_one_string(callback):
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
+@pytest.mark.nightly
def test_callback_batch_fail(callback):
pipe = read_model(get_models_list()[0])[4]
with pytest.raises(RuntimeError):
@@ -340,12 +349,14 @@ def test_callback_batch_fail(callback):
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
+@pytest.mark.nightly
def test_callback_kwargs_one_string(callback):
pipe = read_model(get_models_list()[0])[4]
pipe.generate('table is made of', max_new_tokens=10, streamer=callback)
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
+@pytest.mark.nightly
@pytest.mark.parametrize("model_descr", get_models_list())
def test_callback_decoding_metallama(model_descr, callback):
# On metallam this prompt generates output which can shorten after adding new tokens.
@@ -359,6 +370,7 @@ def test_callback_decoding_metallama(model_descr, callback):
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
@pytest.mark.precommit
+@pytest.mark.nightly
def test_callback_kwargs_batch_fail(callback):
pipe = read_model(get_models_list()[0])[4]
with pytest.raises(RuntimeError):
@@ -380,6 +392,7 @@ def end(self):
@pytest.mark.precommit
+@pytest.mark.nightly
def test_streamer_one_string():
pipe = read_model(get_models_list()[0])[4]
generation_config = pipe.get_generation_config()
@@ -389,6 +402,7 @@ def test_streamer_one_string():
@pytest.mark.precommit
+@pytest.mark.nightly
def test_streamer_batch_fail():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
@@ -397,6 +411,7 @@ def test_streamer_batch_fail():
@pytest.mark.precommit
+@pytest.mark.nightly
def test_streamer_kwargs_one_string():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
@@ -404,6 +419,7 @@ def test_streamer_kwargs_one_string():
@pytest.mark.precommit
+@pytest.mark.nightly
def test_streamer_kwargs_batch_fail():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
@@ -412,6 +428,7 @@ def test_streamer_kwargs_batch_fail():
@pytest.mark.precommit
+@pytest.mark.nightly
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
def test_operator_with_callback_one_string(callback):
pipe = read_model(get_models_list()[0])[4]
@@ -421,6 +438,7 @@ def test_operator_with_callback_one_string(callback):
@pytest.mark.precommit
+@pytest.mark.nightly
@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
def test_operator_with_callback_batch_fail(callback):
pipe = read_model(get_models_list()[0])[4]
@@ -429,6 +447,7 @@ def test_operator_with_callback_batch_fail(callback):
@pytest.mark.precommit
+@pytest.mark.nightly
def test_operator_with_streamer_kwargs_one_string():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
@@ -436,6 +455,7 @@ def test_operator_with_streamer_kwargs_one_string():
@pytest.mark.precommit
+@pytest.mark.nightly
def test_operator_with_streamer_kwargs_batch_fail():
pipe = read_model(get_models_list()[0])[4]
printer = Printer(pipe.get_tokenizer())
@@ -444,6 +464,7 @@ def test_operator_with_streamer_kwargs_batch_fail():
@pytest.mark.precommit
+@pytest.mark.nightly
def test_load_special_tokens_ids_1(model_tmp_path):
# test when there is an available config.json
config_json = {
@@ -458,6 +479,7 @@ def test_load_special_tokens_ids_1(model_tmp_path):
@pytest.mark.precommit
+@pytest.mark.nightly
def test_load_special_tokens_str_2(model_tmp_path):
# test with special_tokens_map
special_tokens_map_json = {
@@ -472,6 +494,7 @@ def test_load_special_tokens_str_2(model_tmp_path):
@pytest.mark.precommit
+@pytest.mark.nightly
def test_load_special_tokens_3_(model_tmp_path):
# special_tokens_map is not available
# but tokenize_config.json exists
@@ -498,6 +521,7 @@ def test_load_special_tokens_3_(model_tmp_path):
@pytest.mark.precommit
+@pytest.mark.nightly
def test_load_special_tokens_3(model_tmp_path):
# both config.json is availabel and tokenizer_config.json available
# check that it does not read int values from tokenizer_config.json if they are in config.json
@@ -532,6 +556,7 @@ def test_load_special_tokens_3(model_tmp_path):
@pytest.mark.precommit
+@pytest.mark.nightly
@pytest.mark.xfail(
raises=AssertionError,
reason="CVS-143410 ov tokenizer should be aligned with hf",
@@ -575,6 +600,7 @@ def test_load_special_tokens_4(model_tmp_path):
]
@pytest.mark.parametrize("generation_config", invalid_configs)
@pytest.mark.precommit
+@pytest.mark.nightly
def test_invalid_configs(model_tmp_path, generation_config):
model_id, temp_path = model_tmp_path
config_json = {}
@@ -584,6 +610,7 @@ def test_invalid_configs(model_tmp_path, generation_config):
@pytest.mark.precommit
+@pytest.mark.nightly
def test_valid_configs(model_tmp_path):
model_id, temp_path = model_tmp_path
pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path)
@@ -602,6 +629,7 @@ def test_valid_configs(model_tmp_path):
dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k
]
@pytest.mark.precommit
+@pytest.mark.nightly
@pytest.mark.parametrize("generation_config", invalid_py_configs)
def test_python_generation_config_validation(model_tmp_path, generation_config):
model_id, temp_path = model_tmp_path
@@ -615,6 +643,7 @@ def test_python_generation_config_validation(model_tmp_path, generation_config):
@pytest.mark.precommit
+@pytest.mark.nightly
def test_unicode_pybind_decoding_1():
# On this model this prompt generates unfinished utf string.
# Test that pybind will not fail.
@@ -626,6 +655,7 @@ def test_unicode_pybind_decoding_1():
@pytest.mark.precommit
+@pytest.mark.nightly
def test_unicode_pybind_decoding_2():
# On this model this prompt generates unfinished utf string.
# Test that pybind will not fail.
@@ -636,6 +666,7 @@ def test_unicode_pybind_decoding_2():
@pytest.mark.precommit
+@pytest.mark.nightly
def test_unicode_pybind_decoding_3():
# On this model this prompt generates unfinished utf-8 string
# and streams it. Test that pybind will not fail while we pass string to python.
@@ -648,6 +679,7 @@ def test_unicode_pybind_decoding_3():
@pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory")
@pytest.mark.precommit
+@pytest.mark.nightly
@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win")
def test_left_pad():
# test left pad tokenizer post processing implementation
From 944321854d77c14cf02a0ff1d32b89ba4e7a1f62 Mon Sep 17 00:00:00 2001
From: Damian Kalinowski
Date: Wed, 24 Jul 2024 08:37:34 +0200
Subject: [PATCH 11/19] Add infer request queue for tokenizers and allow for
optional plugin_config in tokenizer (#651)
This improves performance of CB lib when tested within OVMS.
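
As a usage illustration, the new bindings let callers pass a tokenizer plugin config separately from the LLM plugin config. A minimal sketch, assuming the module and class names exposed by `py_generate_pipeline.cpp` below; paths and properties are placeholders:

```python
import openvino_genai as ov_genai

scheduler_config = ov_genai.SchedulerConfig()

# Tokenizer now takes an optional plugin_config dict (an empty dict keeps plugin defaults).
tokenizer = ov_genai.Tokenizer("/path/to/exported_model", {})

# ContinuousBatchingPipeline now takes separate llm_plugin_config and tokenizer_plugin_config.
pipe = ov_genai.ContinuousBatchingPipeline(
    "/path/to/exported_model", scheduler_config, "CPU",
    {},   # llm_plugin_config
    {},   # tokenizer_plugin_config
)
```
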
---
.../genai/continuous_batching_pipeline.hpp | 3 +-
src/cpp/include/openvino/genai/tokenizer.hpp | 2 +-
src/cpp/src/circular_buffer_queue.hpp | 100 ++++++++++++++++++
src/cpp/src/continuous_batching_pipeline.cpp | 9 +-
src/cpp/src/tokenizer.cpp | 98 ++++++++++-------
src/python/py_generate_pipeline.cpp | 12 +--
tests/python_tests/common.py | 2 +-
tests/python_tests/ov_genai_test_utils.py | 2 +-
tests/python_tests/test_sampling.py | 2 +-
9 files changed, 179 insertions(+), 51 deletions(-)
create mode 100644 src/cpp/src/circular_buffer_queue.hpp
diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
index be9a5fd8c1..f5f8c53309 100644
--- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp
@@ -30,7 +30,8 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
ContinuousBatchingPipeline(const std::string& models_path,
const SchedulerConfig& scheduler_config,
const std::string& device = "CPU",
- const ov::AnyMap& plugin_config = {});
+ const ov::AnyMap& llm_plugin_config = {},
+ const ov::AnyMap& tokenizer_plugin_config = {});
/**
* @brief Constructs a ContinuousBatchingPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs.
diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index 5a1e181e21..425c30128b 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -29,7 +29,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @brief ov::genai::Tokenizer constructor.
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
*/
- Tokenizer(const std::string& tokenizer_path);
+ Tokenizer(const std::string& tokenizer_path, const ov::AnyMap& plugin_config = {});
/**
* @brief encode a single prompt
diff --git a/src/cpp/src/circular_buffer_queue.hpp b/src/cpp/src/circular_buffer_queue.hpp
new file mode 100644
index 0000000000..086854e68e
--- /dev/null
+++ b/src/cpp/src/circular_buffer_queue.hpp
@@ -0,0 +1,100 @@
+#pragma once
+
+#include <atomic>
+#include <functional>
+#include <future>
+#include <mutex>
+#include <numeric>
+#include <queue>
+#include <vector>
+
+namespace ov::genai {
+
+// From OVMS:
+// https://github.com/openvinotoolkit/model_server/blob/d73e85cbb8ac1d761754cb2064a00551a9ffc655/src/queue.hpp#L34
+template <typename T>
+class CircularBufferQueue
+{
+    int m_front_idx;
+    std::atomic<int> m_back_idx;
+    std::vector<int> m_values;
+    std::queue<std::promise<int>> m_promises;
+    std::vector<T> m_data;
+    std::mutex m_front_mut;
+    std::mutex m_queue_mutex;
+
+public:
+
+    CircularBufferQueue(size_t length, const std::function<T()>& create_fn) :
+        m_values(length),
+        m_front_idx{0},
+        m_back_idx{0} {
+        std::iota(m_values.begin(), m_values.end(), 0);
+        m_data.reserve(length);
+        for (size_t i = 0; i < length; i++) {
+            m_data.emplace_back(std::move(create_fn()));
+        }
+    }
+
+    CircularBufferQueue(const CircularBufferQueue&) = delete;
+    CircularBufferQueue(const CircularBufferQueue&&) = delete;
+    CircularBufferQueue& operator=(const CircularBufferQueue&) = delete;
+
+    T& get(int value) {
+        return m_data[value];
+    }
+
+    std::future<int> get_idle() {
+        int value;
+        std::promise<int> idle_promise;
+        std::future<int> idle_future = idle_promise.get_future();
+        std::unique_lock<std::mutex> lk(m_front_mut);
+        if (m_values[m_front_idx] < 0) {
+            std::unique_lock<std::mutex> queueLock(m_queue_mutex);
+            m_promises.push(std::move(idle_promise));
+        } else {
+            value = m_values[m_front_idx];
+            m_values[m_front_idx] = -1;
+            m_front_idx = (m_front_idx + 1) % m_values.size();
+            lk.unlock();
+            idle_promise.set_value(value);
+        }
+        return idle_future;
+    }
+
+    void return_to(int value) {
+        std::unique_lock<std::mutex> lk(m_queue_mutex);
+        if (m_promises.size()) {
+            std::promise<int> promise = std::move(m_promises.front());
+            m_promises.pop();
+            lk.unlock();
+            promise.set_value(value);
+            return;
+        }
+        int old_back = m_back_idx.load();
+        while (!m_back_idx.compare_exchange_weak(
+            old_back,
+            (old_back + 1) % m_values.size(),
+            std::memory_order_relaxed)) {
+        }
+        m_values[old_back] = value;
+    }
+};
+
+template <typename T>
+class CircularBufferQueueElementGuard {
+    CircularBufferQueue<T>* m_queue;
+    int m_value;
+public:
+    CircularBufferQueueElementGuard(CircularBufferQueue<T>* queue) : m_queue(queue) {
+        m_value = m_queue->get_idle().get();  // blocking until we get the element
+    }
+
+    T& get() {
+        return m_queue->get(m_value);
+    }
+
+    ~CircularBufferQueueElementGuard() {
+        m_queue->return_to(m_value);
+    }
+};
+
+}
diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp
index ddfebc5926..55100f3cb4 100644
--- a/src/cpp/src/continuous_batching_pipeline.cpp
+++ b/src/cpp/src/continuous_batching_pipeline.cpp
@@ -105,8 +105,8 @@ class ContinuousBatchingPipeline::Impl {
// read default generation config
}
- Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& plugin_config)
- : Impl{models_path, Tokenizer(models_path), scheduler_config, device, plugin_config} {}
+ Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& llm_plugin_config, const ov::AnyMap& tokenizer_plugin_config)
+ : Impl{models_path, Tokenizer(models_path, tokenizer_plugin_config), scheduler_config, device, llm_plugin_config} {}
ov::genai::GenerationConfig get_config() const {
return m_generation_config;
@@ -282,8 +282,9 @@ class ContinuousBatchingPipeline::Impl {
ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& models_path,
const SchedulerConfig& scheduler_config,
const std::string& device,
- const ov::AnyMap& plugin_config ) {
- m_impl = std::make_shared(models_path, scheduler_config, device, plugin_config);
+ const ov::AnyMap& llm_plugin_config,
+ const ov::AnyMap& tokenizer_plugin_config) {
+ m_impl = std::make_shared(models_path, scheduler_config, device, llm_plugin_config, tokenizer_plugin_config);
}
ContinuousBatchingPipeline::ContinuousBatchingPipeline(
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index ac6b925dcb..b1e36033ee 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -7,7 +7,9 @@
#include
#include
#include "tokenizers_path.hpp"
+#include "circular_buffer_queue.hpp"
#include
+#include
namespace {
@@ -55,10 +57,12 @@ namespace genai {
class Tokenizer::TokenizerImpl {
public:
- ov::InferRequest m_tokenizer_request;
- ov::InferRequest m_detokenizer_request;
- std::mutex m_tokenizer_mutex;
- std::mutex m_detokenizer_mutex;
+ ov::CompiledModel m_tokenizer;
+ ov::CompiledModel m_detokenizer;
+
+ std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_tokenizer;
+ std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_detokenizer;
+
int64_t m_pad_token_id = -1;
int64_t m_bos_token_id = -1;
int64_t m_eos_token_id = -1;
@@ -71,7 +75,7 @@ class Tokenizer::TokenizerImpl {
TokenizerImpl() = default;
- TokenizerImpl(std::filesystem::path tokenizer_path)
+ TokenizerImpl(std::filesystem::path tokenizer_path, const ov::AnyMap& plugin_config)
: m_chat_template{chat_template_from_tokenizer_json_if_exists(tokenizer_path)} {
ov::Core core;
@@ -92,10 +96,23 @@ class Tokenizer::TokenizerImpl {
read_tokenizer_config_if_necessary(tokenizer_path);
auto device = "CPU"; // currently openvino_tokenizer supports only CPU
- m_tokenizer_request = core.compile_model(tokenizer_path / "openvino_tokenizer.xml",
- device).create_infer_request();
- m_detokenizer_request = core.compile_model(tokenizer_path / "openvino_detokenizer.xml",
- device).create_infer_request();
+ m_tokenizer = core.compile_model(tokenizer_path / "openvino_tokenizer.xml",
+ device, plugin_config);
+ m_detokenizer = core.compile_model(tokenizer_path / "openvino_detokenizer.xml",
+ device, plugin_config);
+
+
+ const size_t INFER_REQUEST_QUEUE_SIZE = m_tokenizer.get_property(ov::optimal_number_of_infer_requests);
+ m_ireq_queue_tokenizer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
+ INFER_REQUEST_QUEUE_SIZE,
+ [this]() -> ov::InferRequest {
+ return std::move(this->m_tokenizer.create_infer_request());
+ });
+ m_ireq_queue_detokenizer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
+ INFER_REQUEST_QUEUE_SIZE,
+ [this]() -> ov::InferRequest {
+ return std::move(this->m_detokenizer.create_infer_request());
+ });
// Get special token ids by inference if they are not defined.
infer_special_tokens_if_necessary();
@@ -231,29 +248,35 @@ class Tokenizer::TokenizerImpl {
}
TokenizedInputs encode(std::string prompt) {
+ CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
size_t batch_size = 1;
- std::unique_lock<std::mutex> lock(m_tokenizer_mutex);
- m_tokenizer_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt});
- m_tokenizer_request.infer();
- return get_copied_results();
+ infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt});
+ infer_request_guard.get().start_async();
+ infer_request_guard.get().wait();
+ return get_copied_results(
+ infer_request_guard.get().get_tensor("input_ids"),
+ infer_request_guard.get().get_tensor("attention_mask")
+ );
}
 TokenizedInputs encode(std::vector<std::string>& prompts) {
TokenizedInputs unpadded;
{
- std::unique_lock<std::mutex> lock(m_tokenizer_mutex);
- m_tokenizer_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()});
- auto size_ = m_tokenizer_request.get_input_tensor().get_shape();
- m_tokenizer_request.infer();
-
- unpadded = get_copied_results();
+ CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
+ infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()});
+ auto size_ = infer_request_guard.get().get_input_tensor().get_shape();
+ infer_request_guard.get().start_async();
+ infer_request_guard.get().wait();
+
+ unpadded = get_copied_results(
+ infer_request_guard.get().get_tensor("input_ids"),
+ infer_request_guard.get().get_tensor("attention_mask")
+ );
}
return pad_left(unpadded.input_ids, unpadded.attention_mask);
}
- TokenizedInputs get_copied_results() {
- auto input_ids = m_tokenizer_request.get_tensor("input_ids");
- auto attention_mask = m_tokenizer_request.get_tensor("attention_mask");
+ TokenizedInputs get_copied_results(ov::Tensor input_ids, ov::Tensor attention_mask) {
ov::Tensor input_ids_ = ov::Tensor(input_ids.get_element_type(), input_ids.get_shape());
ov::Tensor attention_mask_ = ov::Tensor(attention_mask.get_element_type(), attention_mask.get_shape());
input_ids.copy_to(input_ids_);
@@ -263,22 +286,24 @@ class Tokenizer::TokenizerImpl {
}
 std::string decode(std::vector<int64_t> tokens) {
+ CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get());
size_t batch_size = 1;
- std::unique_lock<std::mutex> lock(m_detokenizer_mutex);
- m_detokenizer_request.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()});
- m_detokenizer_request.infer();
- return m_detokenizer_request.get_output_tensor().data<std::string>()[0];
+ infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()});
+ infer_request_guard.get().start_async();
+ infer_request_guard.get().wait();
+ return infer_request_guard.get().get_output_tensor().data<std::string>()[0];
}
 std::vector<std::string> decode(ov::Tensor tokens) {
OPENVINO_ASSERT(tokens.get_element_type() == ov::element::i64, "tokens tensor element type should be an i64");
OPENVINO_ASSERT(tokens.get_shape().size() == 2, "tokens tensor should of rank 2 with shape [batch_size, seq_len]");
- std::unique_lock<std::mutex> lock(m_detokenizer_mutex);
- m_detokenizer_request.set_input_tensor(tokens);
- m_detokenizer_request.infer();
+ CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get());
+ infer_request_guard.get().set_input_tensor(tokens);
+ infer_request_guard.get().start_async();
+ infer_request_guard.get().wait();
- auto res = m_detokenizer_request.get_output_tensor();
+ auto res = infer_request_guard.get().get_output_tensor();
 auto res_data = res.data<std::string>();
 return std::vector<std::string>(res_data, res_data + res.get_shape()[0]);
}
@@ -299,10 +324,11 @@ class Tokenizer::TokenizerImpl {
std::fill(tokens_data + i * max_len + line_len, tokens_data + (i + 1) * max_len, m_pad_token_id);
}
- std::unique_lock<std::mutex> lock(m_detokenizer_mutex);
- m_detokenizer_request.set_input_tensor(tokens);
- m_detokenizer_request.infer();
- auto res = m_detokenizer_request.get_output_tensor();
+ CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get());
+ infer_request_guard.get().set_input_tensor(tokens);
+ infer_request_guard.get().start_async();
+ infer_request_guard.get().wait();
+ auto res = infer_request_guard.get().get_output_tensor();
 auto res_data = res.data<std::string>();
 return std::vector<std::string>(res_data, res_data + res.get_shape()[0]);
}
@@ -411,9 +437,9 @@ class Tokenizer::TokenizerImpl {
};
-Tokenizer::Tokenizer(const std::string& tokenizer_path) {
+Tokenizer::Tokenizer(const std::string& tokenizer_path, const ov::AnyMap& plugin_config) {
ScopedVar env_manager(tokenizers_relative_to_genai().string());
- m_pimpl = std::make_shared(tokenizer_path);
+ m_pimpl = std::make_shared(tokenizer_path, plugin_config);
}
TokenizedInputs Tokenizer::encode(const std::string prompt) {
diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp
index d7b2aab29c..8a1a226bc1 100644
--- a/src/python/py_generate_pipeline.cpp
+++ b/src/python/py_generate_pipeline.cpp
@@ -436,10 +436,10 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
R"(openvino_genai.Tokenizer object is used to initialize Tokenizer
if it's located in a different path than the main model.)")
- .def(py::init([](const std::string& tokenizer_path) {
+ .def(py::init([](const std::string& tokenizer_path, const std::map<std::string, py::object>& plugin_config) {
ScopedVar env_manager(ov_tokenizers_module_path());
- return std::make_unique(tokenizer_path);
- }), py::arg("tokenizer_path"))
+ return std::make_unique(tokenizer_path, properties_to_any_map(plugin_config));
+ }), py::arg("tokenizer_path"), py::arg("plugin_config") = ov::AnyMap({}))
.def("encode", [](Tokenizer& tok, std::vector& prompts) { return tok.encode(prompts); },
py::arg("prompts"),
@@ -596,10 +596,10 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
.def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs);
py::class_(m, "ContinuousBatchingPipeline")
- .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
+ .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& llm_plugin_config, const std::map<std::string, py::object>& tokenizer_plugin_config) {
ScopedVar env_manager(ov_tokenizers_module_path());
- return std::make_unique(model_path, scheduler_config, device, properties_to_any_map(plugin_config));
- }), py::arg("model_path"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap({}))
+ return std::make_unique(model_path, scheduler_config, device, properties_to_any_map(llm_plugin_config), properties_to_any_map(tokenizer_plugin_config));
+ }), py::arg("model_path"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("llm_plugin_config") = ov::AnyMap({}), py::arg("tokenizer_plugin_config") = ov::AnyMap({}))
 .def(py::init([](const std::string& model_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) {
ScopedVar env_manager(ov_tokenizers_module_path());
return std::make_unique(model_path, tokenizer, scheduler_config, device, properties_to_any_map(plugin_config));
diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py
index 95046a463a..0a94558274 100644
--- a/tests/python_tests/common.py
+++ b/tests/python_tests/common.py
@@ -273,7 +273,7 @@ def run_continuous_batching(
prompts: List[str],
generation_configs : List[GenerationConfig]
) -> List[GenerationResult]:
- pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {})
+ pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {}, {})
output = pipe.generate(prompts, generation_configs)
del pipe
shutil.rmtree(model_path)
diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py
index 7560486d42..bf76df534d 100644
--- a/tests/python_tests/ov_genai_test_utils.py
+++ b/tests/python_tests/ov_genai_test_utils.py
@@ -208,7 +208,7 @@ def load_tok(configs: List[Tuple], temp_path):
for config_json, config_name in configs:
with (temp_path / config_name).open('w') as f:
json.dump(config_json, f)
- return ov_genai.Tokenizer(str(temp_path))
+ return ov_genai.Tokenizer(str(temp_path), {})
def load_pipe(configs: List[Tuple], temp_path):
diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index 27596359bf..9b34cd2f5b 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -306,7 +306,7 @@ def test_post_oom_health(tmp_path):
model_path : Path = tmp_path / model_id
save_ov_model_from_optimum(model, hf_tokenizer, model_path)
- pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix()), scheduler_config)
+ pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix(), {}), scheduler_config, "CPU", {})
# First run should return incomplete response
output = pipe.generate(["What is OpenVINO?"], generation_configs)
assert(len(output))
From 04012f473c0eac190701926366e9b05704b80196 Mon Sep 17 00:00:00 2001
From: Alexander Suvorov
Date: Wed, 24 Jul 2024 09:40:37 +0200
Subject: [PATCH 12/19] Skip test_preemption_with_multinomial_n_seq (#667)
Random sampling
---
tests/python_tests/test_preemption.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/tests/python_tests/test_preemption.py b/tests/python_tests/test_preemption.py
index 8c9bda1d33..cce74136eb 100644
--- a/tests/python_tests/test_preemption.py
+++ b/tests/python_tests/test_preemption.py
@@ -161,6 +161,7 @@ def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse):
@pytest.mark.parametrize("dynamic_split_fuse", [True, False])
@pytest.mark.precommit
+@pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. Test passes on CI but fails locally.")
def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse):
generation_configs = multinomial_params_n_seq.generation_config
for config in generation_configs:
From cc5e2356d64b709f765fda5563113b7802855db4 Mon Sep 17 00:00:00 2001
From: Sylwia Kuros
Date: Wed, 24 Jul 2024 12:19:54 +0200
Subject: [PATCH 13/19] Set torchvision to < 0.19.0 (#668)
Using torchvision with version 0.19.0 causes the following issue:
```
Traceback (most recent call last):
File "C:\Program Files\Python310\lib\site-packages\transformers\utils\import_utils.py", line 1567, in _get_module
return importlib.import_module("." + module_name, self.__name__)
File "C:\Program Files\Python310\lib\importlib\__init__.py", line 126, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
File "", line 1050, in _gcd_import
File "", line 1027, in _find_and_load
File "", line 1006, in _find_and_load_unlocked
File "", line 688, in _load_unlocked
File "", line 883, in exec_module
File "", line 241, in _call_with_frames_removed
File "C:\Program Files\Python310\lib\site-packages\transformers\models\auto\image_processing_auto.py", line 27, in
from ...image_processing_utils import BaseImageProcessor, ImageProcessingMixin
File "C:\Program Files\Python310\lib\site-packages\transformers\image_processing_utils.py", line 21, in
from .image_transforms import center_crop, normalize, rescale
File "C:\Program Files\Python310\lib\site-packages\transformers\image_transforms.py", line 22, in
from .image_utils import (
File "C:\Program Files\Python310\lib\site-packages\transformers\image_utils.py", line 58, in
from torchvision.transforms import InterpolationMode
File "C:\Program Files\Python310\lib\site-packages\torchvision\__init__.py", line 10, in
from torchvision import _meta_registrations, datasets, io, models, ops, transforms, utils # usort:skip
File "C:\Program Files\Python310\lib\site-packages\torchvision\_meta_registrations.py", line 163, in
@torch.library.register_fake("torchvision::nms")
AttributeError: module 'torch.library' has no attribute 'register_fake'
```
---
llm_bench/python/requirements.txt | 1 +
1 file changed, 1 insertion(+)
diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt
index ed80a66deb..d83cd5a376 100644
--- a/llm_bench/python/requirements.txt
+++ b/llm_bench/python/requirements.txt
@@ -7,6 +7,7 @@ openvino_genai
auto-gptq>=0.5.1 # for gptq
pillow
torch
+torchvision<0.19.0
transformers>=4.40.0
diffusers>=0.22.0
#optimum is in dependency list of optimum-intel
From 42dd04900cded77671ae1fa9d50f888180ace73f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mi=C5=82osz=20=C5=BBeglarski?=
Date: Wed, 24 Jul 2024 12:14:35 +0200
Subject: [PATCH 14/19] [Continuous batching] In the event of OOM, return
tokens generated so far for the request (#661)
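
A minimal sketch of the behaviour this patch enables, mirroring the updated `test_post_oom_health` below; class and helper names follow the tests in this series, and the exact scheduler settings are assumptions:

```python
# A deliberately tiny KV-cache forces an out-of-memory condition mid-generation;
# generate() should still return the tokens produced so far for the request.
from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, SchedulerConfig, Tokenizer

generation_config = GenerationConfig()
generation_config.ignore_eos = True
generation_config.max_new_tokens = 1_000_000     # keep generating until the cache runs out

scheduler_config = SchedulerConfig()
scheduler_config.num_kv_blocks = 10              # tiny cache to trigger OOM quickly

model_dir = "/path/to/exported_model"
pipe = ContinuousBatchingPipeline(model_dir, Tokenizer(model_dir, {}), scheduler_config, "CPU", {})

output = pipe.generate(["What is OpenVINO?"], [generation_config])
assert len(output)
assert len(output[0].m_generation_ids)           # partial generation is preserved after OOM
```
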
---
src/cpp/src/sequence_group.hpp | 71 ++++++++++++-----------------
tests/python_tests/test_sampling.py | 11 +++--
2 files changed, 36 insertions(+), 46 deletions(-)
diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp
index 3df1820cfb..88b86b4484 100644
--- a/src/cpp/src/sequence_group.hpp
+++ b/src/cpp/src/sequence_group.hpp
@@ -425,59 +425,46 @@ class SequenceGroup {
return m_generation_stream->get_status() == GenerationStatus::DROPPED_BY_HANDLE;
}
- void notify_handle() {
+ void push_outputs() {
+ GenerationOutputs outputs;
+ for (auto& sequence: m_sequences) {
+ GenerationOutput output;
+ output.generated_token_ids = sequence->get_generated_ids();
+ output.score = sequence->get_beam_search_score(m_sampling_params);
+ outputs.emplace(sequence->get_grouped_id(), output);
+ }
+ m_generation_stream->push(outputs);
+ }
+
+ void push_partial_outputs() {
+ GenerationOutputs outputs;
+ // TODO: support streaming for n seqs
+ for (auto& sequence : m_sequences) {
+ // todo: check seq.is_finished() to generate without several
+ // or is it ok to use padding?
+ const auto last_gen_token = sequence->get_last_generation_output();
+ outputs.emplace(sequence->get_grouped_id(), last_gen_token);
+ }
+ m_generation_stream->push(outputs);
+ }
+ void notify_handle() {
if (out_of_memory()) {
set_generation_status(GenerationStatus::IGNORED);
} else if (has_finished()) {
set_generation_status(GenerationStatus::FINISHED);
}
-
- GenerationOutputs outputs;
-
// For beam search streaming is not available, so we notify only upon finishing
if(m_sampling_params.is_beam_search()) {
- if (has_finished()) {
- std::vector finished_sequences = get_finished_sequences();
-
- OPENVINO_ASSERT(finished_sequences.size() == num_total_seqs() && has_finished());
- for (auto& sequence: finished_sequences) {
- GenerationOutput output;
- output.generated_token_ids = sequence->get_generated_ids();
- output.score = sequence->get_beam_search_score(m_sampling_params);
- outputs.emplace(sequence->get_grouped_id(), output);
- }
-
- if (outputs.size()) {
- m_generation_stream->push(outputs);
- }
+ if (has_finished() || out_of_memory()) {
+ push_outputs();
}
- // For greedy or multinomial sampling we decide whever to stream partial results depending on the user parameter
} else if (m_sampling_params.is_greedy_decoding() || m_sampling_params.is_multinomial()) {
// TO DO: Now we always stream for greedy search for the sake of benchmarking
- if (num_total_seqs() == 1 /* m_sampling_params.stream */) {
- // TODO: support streamimg for n seqs
- for (auto& sequence : m_sequences) {
- // todo: check seq.is_finished() to generate without several
- // or is it ok to use padding?
- const auto last_gen_token = sequence->get_last_generation_output();
- outputs.emplace(sequence->get_grouped_id(), last_gen_token);
- }
- m_generation_stream->push(outputs);
- } else if (has_finished()) {
- std::vector finished_sequences = get_finished_sequences();
-
- OPENVINO_ASSERT(finished_sequences.size() == num_total_seqs() && has_finished());
- for (auto& sequence: finished_sequences) {
- GenerationOutput output;
- output.generated_token_ids = sequence->get_generated_ids();
- output.score = sequence->get_cumulative_log_probs();
- outputs.emplace(sequence->get_grouped_id(), output);
- }
-
- if (outputs.size()) {
- m_generation_stream->push(outputs);
- }
+ if (num_total_seqs() == 1) {
+ push_partial_outputs();
+ } else if (has_finished() || out_of_memory()) {
+ push_outputs();
}
}
}
diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index 9b34cd2f5b..741c89db78 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -291,8 +291,9 @@ def test_individual_generation_configs_random(tmp_path, test_struct: RandomSampl
@pytest.mark.precommit
-def test_post_oom_health(tmp_path):
- generation_config = get_greedy()
+@pytest.mark.parametrize("sampling_config", [get_greedy(), get_beam_search(), get_multinomial_all_parameters()])
+def test_post_oom_health(tmp_path, sampling_config):
+ generation_config = sampling_config
generation_config.ignore_eos = True
generation_config.max_new_tokens = 1000000
@@ -309,9 +310,11 @@ def test_post_oom_health(tmp_path):
pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix(), {}), scheduler_config, "CPU", {})
# First run should return incomplete response
output = pipe.generate(["What is OpenVINO?"], generation_configs)
- assert(len(output))
+ assert (len(output))
+ assert(len(output[0].m_generation_ids))
# Same for the second run, here we want to make sure the cleanup works and we have free blocks after recent OOM
output = pipe.generate(["What is OpenVINO?"], generation_configs)
- assert(len(output))
+ assert (len(output))
+ assert(len(output[0].m_generation_ids))
del pipe
shutil.rmtree(model_path)
\ No newline at end of file
From 97595208b02dd479bf159305bec00b5cf1a9999f Mon Sep 17 00:00:00 2001
From: Zlobin Vladimir
Date: Thu, 25 Jul 2024 14:30:39 +0400
Subject: [PATCH 15/19] Bump version only (#684)
---
CMakeLists.txt | 2 +-
pyproject.toml | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 27ed56b453..f45ab24279 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,7 +18,7 @@ elseif(NOT GENERATOR_IS_MULTI_CONFIG_VAR AND NOT DEFINED CMAKE_BUILD_TYPE)
endif()
project(OpenVINOGenAI
- VERSION 2024.3.0.0
+ VERSION 2024.4.0.0
DESCRIPTION "OpenVINO GenAI"
HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai"
LANGUAGES CXX)
diff --git a/pyproject.toml b/pyproject.toml
index 7cfa564ef9..af55c3f684 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "openvino_genai"
-version = "2024.3.0.0"
+version = "2024.4.0.0"
description = "Python bindings for https://github.com/openvinotoolkit/openvino.genai"
requires-python = ">=3.8"
readme = {file = "src/README.md", content-type="text/markdown"}
From f42e63d706c4a51a9f470d19b5677f1b3d498c35 Mon Sep 17 00:00:00 2001
From: Zlobin Vladimir
Date: Thu, 25 Jul 2024 17:31:07 +0400
Subject: [PATCH 16/19] Fix merge conflicts resolution (#685)
---
CMakeLists.txt | 18 +-----------------
thirdparty/openvino_tokenizers | 2 +-
2 files changed, 2 insertions(+), 18 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f45ab24279..e080b4a97a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -57,33 +57,17 @@ if(ENABLE_PYTHON)
endif()
endif()
-if(ENABLE_PYTHON)
- # the following two calls are required for cross-compilation
- if(OpenVINODeveloperPackage_DIR)
- ov_find_python3(REQUIRED)
- ov_detect_python_module_extension()
- else()
- if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
- find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module)
- else()
- find_package(Python3 REQUIRED COMPONENTS Interpreter Development)
- endif()
- endif()
-endif()
-
add_subdirectory(thirdparty)
add_subdirectory(src)
add_subdirectory(samples)
add_subdirectory(tests/cpp)
-install(FILES LICENSE DESTINATION docs/licensing COMPONENT licensing_genai RENAME LICENSE-GENAI)
-install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt)
install(FILES LICENSE DESTINATION docs/licensing COMPONENT licensing_genai RENAME LICENSE-GENAI)
install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt)
set(CPACK_ARCHIVE_COMPONENT_INSTALL ON)
set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF)
# Workaround https://gitlab.kitware.com/cmake/cmake/-/issues/2614
-set(CPACK_COMPONENTS_ALL core_genai core_genai_dev cpp_samples_genai licensing_genai openvino_tokenizers openvino_tokenizers_licenses)
+set(CPACK_COMPONENTS_ALL core_genai core_genai_dev cpp_samples_genai licensing_genai openvino_tokenizers openvino_tokenizers_docs)
if(ENABLE_PYTHON)
list(APPEND CPACK_COMPONENTS_ALL pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR})
endif()
diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers
index 04795c1b78..fb0157c30a 160000
--- a/thirdparty/openvino_tokenizers
+++ b/thirdparty/openvino_tokenizers
@@ -1 +1 @@
-Subproject commit 04795c1b78c61e3294d1744c78a8ebb5e129256c
+Subproject commit fb0157c30a8a7f6538471fe622b8b52a3800278a
From 14f9c2b1b935d805e7bcb270791880a6cfdbc657 Mon Sep 17 00:00:00 2001
From: Nikita Malinin
Date: Thu, 25 Jul 2024 17:25:24 +0200
Subject: [PATCH 17/19] Partial revert of #616 (#687)
Reverts broken `data-aware` changes from #616
---
llm_bench/python/utils/nncf_utils.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/llm_bench/python/utils/nncf_utils.py b/llm_bench/python/utils/nncf_utils.py
index 25ef8aff18..01d0dd95b3 100644
--- a/llm_bench/python/utils/nncf_utils.py
+++ b/llm_bench/python/utils/nncf_utils.py
@@ -38,7 +38,7 @@ def get_compressed_path(output_dir: str, base_precision, option: str):
INT4_MODEL_CONFIGURATION = {
- "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, "scale": True},
+ "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64},
"opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8},
"red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128},
@@ -69,13 +69,11 @@ def get_compressed_path(output_dir: str, base_precision, option: str):
"mistral-7b-v0.1": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.9},
"llama-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7},
"opt-2.7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.7},
- "red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0, "scale": True},
+ "red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
"vicuna-7b-v1.5": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0},
"stablelm-tuned-alpha-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
- "gpt-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.5, "scale": True},
"longchat-b7": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9},
"starcoder2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9},
"tiny-llama-1.1b-chat": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8},
- "stablelm-7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.6, "scale": True},
"phi-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9},
}
From f2010de9fbcf69ff44b465535c3ff9efeb749f7e Mon Sep 17 00:00:00 2001
From: Sylwia Kuros
Date: Fri, 26 Jul 2024 08:47:09 +0200
Subject: [PATCH 18/19] Update requirements.txt
---
llm_bench/python/requirements.txt | 1 -
1 file changed, 1 deletion(-)
diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt
index d83cd5a376..ed80a66deb 100644
--- a/llm_bench/python/requirements.txt
+++ b/llm_bench/python/requirements.txt
@@ -7,7 +7,6 @@ openvino_genai
auto-gptq>=0.5.1 # for gptq
pillow
torch
-torchvision<0.19.0
transformers>=4.40.0
diffusers>=0.22.0
#optimum is in dependency list of optimum-intel
From 4bd1a26a08cca1895475add911bc53d8eff34a6c Mon Sep 17 00:00:00 2001
From: Anastasiia Pnevskaia
Date: Fri, 26 Jul 2024 08:51:58 +0200
Subject: [PATCH 19/19] Prefix caching. (#639)
Implementation of prefix caching.
Ticket: CVS-138669
---
.../openvino/genai/scheduler_config.hpp | 8 +
src/cpp/src/block_manager.hpp | 258 +++++++++++++++++-
src/cpp/src/scheduler.hpp | 28 +-
src/cpp/src/sequence_group.hpp | 21 ++
src/python/py_generate_pipeline.cpp | 5 +-
tests/cpp/CMakeLists.txt | 5 +-
tests/cpp/block_manager.cpp | 31 ++-
tests/cpp/evictor.cpp | 54 ++++
tests/cpp/scheduler.cpp | 68 +++++
9 files changed, 443 insertions(+), 35 deletions(-)
create mode 100644 tests/cpp/evictor.cpp
diff --git a/src/cpp/include/openvino/genai/scheduler_config.hpp b/src/cpp/include/openvino/genai/scheduler_config.hpp
index 787060d07e..d9bf7a7b41 100644
--- a/src/cpp/include/openvino/genai/scheduler_config.hpp
+++ b/src/cpp/include/openvino/genai/scheduler_config.hpp
@@ -30,5 +30,13 @@ struct SchedulerConfig {
// max number of scheduled sequences (you can think of it as "max batch size")
std::size_t max_num_seqs = 256;
+
+ // Enable caching of KV-blocks.
+ // When turned on, all previously calculated KV-caches are kept in memory for future use.
+ // KV-caches can be overwritten if the KV-cache limit is reached, but blocks are not released.
+ // This results in more RAM usage; maximum RAM usage is determined by the cache_size or num_kv_blocks parameters.
+ // When turned off, only the KV-cache required for batch calculation is kept in memory and
+ // when a sequence has finished generation its cache is released.
+ bool enable_prefix_caching = false;
};
}
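
A sketch of turning the new flag on from Python, assuming the `SchedulerConfig` binding exposes `enable_prefix_caching` (the `py_generate_pipeline.cpp` change in this patch suggests it does); the path is a placeholder:

```python
# Reuse previously computed KV-blocks across requests that share a prompt prefix
# (for example a fixed system prompt). RAM usage grows up to the configured cache size.
from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig

scheduler_config = SchedulerConfig()
scheduler_config.enable_prefix_caching = True

pipe = ContinuousBatchingPipeline("/path/to/exported_model", scheduler_config, "CPU", {}, {})
```
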
diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp
index ab60b7f5ff..3b1a663235 100644
--- a/src/cpp/src/block_manager.hpp
+++ b/src/cpp/src/block_manager.hpp
@@ -6,6 +6,7 @@
#include
#include
#include |