From 5924b23afe13d4299e3a1d49f72b6abca62346c6 Mon Sep 17 00:00:00 2001 From: sbalandi Date: Mon, 13 Jan 2025 12:49:57 +0000 Subject: [PATCH] Automatically apply chat template in non-chat scenarios --- .github/workflows/causal_lm_cpp.yml | 20 +++++++---- README.md | 1 - samples/cpp/text_generation/README.md | 2 +- samples/python/text_generation/README.md | 2 +- src/README.md | 2 ++ .../openvino/genai/generation_config.hpp | 4 +++ .../include/openvino/genai/llm_pipeline.hpp | 4 +++ src/cpp/include/openvino/genai/tokenizer.hpp | 3 ++ .../genai/visual_language/pipeline.hpp | 8 +++++ .../genai/whisper_generation_config.hpp | 2 ++ src/cpp/src/generation_config.cpp | 1 + src/cpp/src/icontinuous_batching.cpp | 15 ++++++-- src/cpp/src/llm_pipeline_stateful.cpp | 34 ++++++++++++++----- src/cpp/src/llm_pipeline_static.cpp | 20 +++++++++-- src/cpp/src/tokenizer.cpp | 8 +++++ .../src/visual_language/inputs_embedder.cpp | 29 ++++++++++++++-- .../src/visual_language/inputs_embedder.hpp | 3 ++ src/cpp/src/visual_language/pipeline.cpp | 2 ++ .../openvino_genai/py_openvino_genai.pyi | 2 ++ src/python/py_generation_config.cpp | 1 + src/python/py_tokenizer.cpp | 6 ++++ tests/python_tests/common.py | 18 ++++++++-- tests/python_tests/test_generation_config.py | 2 ++ tests/python_tests/test_llm_pipeline.py | 6 ++-- tests/python_tests/test_sampling.py | 2 +- ...fusion-v2-1_p0_iter0_pid2834205_output.png | 3 ++ ...fusion-v2-1_p0_iter0_pid2834425_output.png | 3 ++ ...fusion-v2-1_p0_iter1_pid2834205_output.png | 3 ++ ...fusion-v2-1_p0_iter1_pid2834425_output.png | 3 ++ tools/llm_bench/task/text_generation.py | 12 +++++++ .../task/visual_language_generation.py | 1 + tools/who_what_benchmark/whowhatbench/wwb.py | 3 +- 32 files changed, 193 insertions(+), 32 deletions(-) create mode 100644 tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834205_output.png create mode 100644 tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834425_output.png create mode 100644 tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834205_output.png create mode 100644 tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834425_output.png diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index c5ac2c6acc..0755847d55 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -209,7 +209,7 @@ jobs: " echo "你好! 你好嗎?" passed - timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt + timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" "return 0" "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r', errors='ignore') as file: @@ -219,7 +219,7 @@ jobs: print('\n\n') tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') prompts = [ - 'Alan Turing was a', + 'Why is the Sun yellow?', 'return 0', '你好! 你好嗎?' 
] @@ -227,7 +227,6 @@ jobs: if tokenizer.chat_template: prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) - print(tokenized) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) print(ref) @@ -277,7 +276,10 @@ jobs: echo import transformers > ref.py echo predictions = open('cpp.txt', 'r').read() >> ref.py echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py - echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py + echo prompt = '69' >> ref.py + echo if tokenizer.chat_template: >> ref.py + echo prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) >> ref.py + echo tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) >> ref.py echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py echo ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py echo idx = predictions.find(ref) >> ref.py @@ -584,7 +586,10 @@ jobs: with open('pred_greedy.txt', 'r') as file: predictions = file.read() tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5') - tokenized = tokenizer('Alan Turing was a', return_tensors='pt') + prompt = 'Alan Turing was a' + if tokenizer.chat_template: + prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False): ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) @@ -639,7 +644,10 @@ jobs: with open('pred_greedy.txt', 'r') as file: predictions = file.read() tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat') - tokenized = tokenizer('Alan Turing was a', return_tensors='pt') + prompt = 'Alan Turing was a' + if tokenizer.chat_template: + prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False): ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) diff --git a/README.md b/README.md index cea1e358bc..221a81c6c3 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,6 @@ from PIL import Image # Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU pipe = openvino_genai.VLMPipeline("./InternVL2-1B", "CPU") -pipe.start_chat() image = Image.open("dog.jpg") image_data = 
np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8) diff --git a/samples/cpp/text_generation/README.md b/samples/cpp/text_generation/README.md index f370c74a80..886028ed3f 100644 --- a/samples/cpp/text_generation/README.md +++ b/samples/cpp/text_generation/README.md @@ -48,7 +48,7 @@ Recommended models: meta-llama/Llama-2-7b-chat-hf, TinyLlama/TinyLlama-1.1B-Chat ./chat_sample ``` #### Missing chat template -If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. +If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work around this, manually add the chat template to the tokenizer_config.json of your model, or update it by calling `pipe.get_tokenizer().set_chat_template(new_chat_template)`. The following template can be used as a default, but it may not work properly with every model: ``` "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", diff --git a/samples/python/text_generation/README.md b/samples/python/text_generation/README.md index 84b5302639..36e456d90b 100644 --- a/samples/python/text_generation/README.md +++ b/samples/python/text_generation/README.md @@ -48,7 +48,7 @@ Recommended models: meta-llama/Llama-2-7b-chat-hf, TinyLlama/TinyLlama-1.1B-Chat python chat_sample.py model_dir ``` #### Missing chat template -If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. +If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work around this, manually add the chat template to the tokenizer_config.json of your model, or update it by calling `pipe.get_tokenizer().set_chat_template(new_chat_template)`. The following template can be used as a default, but it may not work properly with every model: ``` "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", diff --git a/src/README.md b/src/README.md index af4953f98a..c2ed8c2a60 100644 --- a/src/README.md +++ b/src/README.md @@ -73,6 +73,8 @@ output: 'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color. The arrangement of carbon atoms in' ``` +>**Note**: The chat_template from tokenizer_config.json or from the tokenizer/detokenizer model is applied to the prompt automatically at the generation stage. To disable it, call `pipe.get_tokenizer().set_chat_template("")` or set `generation_config.apply_chat_template` to `False`.
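As an illustration of the note above, here is a minimal Python sketch of both opt-out paths; the model directory `./TinyLlama-1.1B-Chat-v1.0` is a placeholder, so substitute your own exported model and device:

```python
import openvino_genai as ov_genai

# Placeholder model directory; use any LLM exported for OpenVINO GenAI.
pipe = ov_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0", "CPU")

# Default behaviour after this change: the chat template is applied to the raw prompt.
print(pipe.generate("Why is the Sun yellow?", max_new_tokens=30))

# Opt out for a single call through the generation config.
print(pipe.generate("Why is the Sun yellow?", max_new_tokens=30, apply_chat_template=False))

# Or opt out globally by clearing the template on the tokenizer.
pipe.get_tokenizer().set_chat_template("")
print(pipe.generate("Why is the Sun yellow?", max_new_tokens=30))
```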
+ A simple chat in Python: ```python import openvino_genai as ov_genai diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 3a75fc02ea..e3f1abb002 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -128,6 +128,8 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { std::optional adapters; + bool apply_chat_template = true; + /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. * Otherwise verifies eos_token_id == tokenizer_eos_token_id. */ @@ -189,6 +191,8 @@ extern OPENVINO_GENAI_EXPORTS ov::Property rng_seed; static constexpr ov::Property assistant_confidence_threshold{"assistant_confidence_threshold"}; static constexpr ov::Property num_assistant_tokens{"num_assistant_tokens"}; +static constexpr ov::Property apply_chat_template{"apply_chat_template"}; + // Predefined Configs OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release") diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 31b1ac1675..26232574dc 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -177,6 +177,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param generation_config optional GenerationConfig * @param streamer optional streamer * @return DecodedResults decoded resulting text + * The chat_template will be applied to the prompt; run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it. + * To disable it in non-chat mode, set the chat template to an empty string ("") or set generation_config.apply_chat_template to false. */ DecodedResults generate( StringInputs inputs, @@ -191,6 +193,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param inputs input prompt or a vector of prompts * @param properties properties * @return DecodedResults decoded resulting text + * The chat_template will be applied to the prompt; run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it. + * To disable it in non-chat mode, set the chat template to an empty string ("") or set generation_config.apply_chat_template to false. */ template util::EnableIfAllStringAny generate( diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 0a54d1da2a..bde4eb3fe1 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -221,6 +221,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /// @param chat_template The new template to override with. void set_chat_template(const std::string& chat_template); + /// @brief Returns the current chat template, e.g. to check whether it is empty. + std::string get_chat_template() const; + // information about , tokens should be public, // they are used at least in StreamerBase descendants int64_t get_bos_token_id() const; diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index 8c3d380b0f..b6b1d5c7f6 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -98,6 +98,8 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// @param generation_config A config to follow for text generation.
/// @param streamer A streamer to acquire intermediate result. /// @return A string generated by a model. + /// The chat_template will be applied to the prompt; run pipe.set_chat_template(custom_chat_template) to update it. + /// To disable it in non-chat mode, set the chat template to an empty string ("") or set generation_config.apply_chat_template to false. VLMDecodedResults generate( const std::string& prompt, const std::vector& rgbs, @@ -111,6 +113,8 @@ /// @param generation_config A config to follow for text generation. /// @param streamer A streamer to acquire intermediate result. /// @return A string generated by a model. + /// The chat_template will be applied to the prompt; run pipe.set_chat_template(custom_chat_template) to update it. + /// To disable it in non-chat mode, set the chat template to an empty string ("") or set generation_config.apply_chat_template to false. VLMDecodedResults generate( const std::string& prompt, const ov::Tensor& rgb, @@ -124,6 +128,8 @@ /// for its members, StreamerVariant a single image or multiple /// images. /// @return A string generated by a model. + /// The chat_template will be applied to the prompt; run pipe.set_chat_template(custom_chat_template) to update it. + /// To disable it in non-chat mode, set the chat template to an empty string ("") or set generation_config.apply_chat_template to false. VLMDecodedResults generate( const std::string& prompt, const ov::AnyMap& config_map @@ -137,6 +143,8 @@ /// @param ...properties ov::Property instances to be combined into /// ov::AnyMap. /// @return A string generated by a model. + /// The chat_template will be applied to the prompt; run pipe.set_chat_template(custom_chat_template) to update it. + /// To disable it in non-chat mode, set the chat template to an empty string ("") or set generation_config.apply_chat_template to false. template util::EnableIfAllStringAny generate( const std::string& prompt, diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp b/src/cpp/include/openvino/genai/whisper_generation_config.hpp index 18b4202609..4bc186495f 100644 --- a/src/cpp/include/openvino/genai/whisper_generation_config.hpp +++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp @@ -97,6 +97,8 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig : public GenerationConfig { // A list containing the non-speech tokens that will be suppressed during generation.
std::vector suppress_tokens; + bool apply_chat_template = false; + void update_generation_config(const ov::AnyMap& config_map = {}); template diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index de23852c9b..3914e217c4 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -128,6 +128,7 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& properties) { read_anymap_param(properties, "logprobs", logprobs); read_anymap_param(properties, "num_return_sequences", num_return_sequences); read_anymap_param(properties, "adapters", adapters); + read_anymap_param(properties, "apply_chat_template", apply_chat_template); // penalties read_anymap_param(properties, "frequency_penalty", frequency_penalty); diff --git a/src/cpp/src/icontinuous_batching.cpp b/src/cpp/src/icontinuous_batching.cpp index 78f8fda8f7..6b748d6665 100644 --- a/src/cpp/src/icontinuous_batching.cpp +++ b/src/cpp/src/icontinuous_batching.cpp @@ -53,9 +53,20 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( } else { input_ids.reserve(prompts.size()); timer.start(); - for (const std::string& prompt : prompts) { + for (size_t i = 0; i < prompts.size(); i++) { + const std::string& prompt = prompts.at(i); const auto encode_start = std::chrono::steady_clock::now(); - input_ids.push_back(m_tokenizer.encode(prompt).input_ids); + ov::Tensor encoded_inputs; + if (sampling_params.at(i).apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + encoded_inputs = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids; + } else { + // in case when chat_template was not found in tokenizer_config.json or set + encoded_inputs = m_tokenizer.encode(prompt).input_ids; + } + input_ids.push_back(encoded_inputs); tokenization_durations.emplace_back(PerfMetrics::get_microsec(std::chrono::steady_clock::now() - encode_start)); } timer.end(); diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp index 6836e57257..250468ef9f 100644 --- a/src/cpp/src/llm_pipeline_stateful.cpp +++ b/src/cpp/src/llm_pipeline_stateful.cpp @@ -9,6 +9,8 @@ #include "text_callback_streamer.hpp" #include "utils.hpp" +#include "debug_utils.hpp" + namespace ov::genai { StatefulLLMPipeline::StatefulLLMPipeline( @@ -87,14 +89,19 @@ DecodedResults StatefulLLMPipeline::generate( TokenizedInputs encoded_input; if (auto input_vector = std::get_if>(&inputs)) { - std::vector templated_input_vector; - for (auto& input : *input_vector) { - ChatHistory history({{{"role", "user"}, {"content", input}}}); - constexpr bool add_generation_prompt = true; - auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); - templated_input_vector.push_back(templated_prompt); + OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts"); + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + std::vector templated_input_vector; + for (auto& input : *input_vector) { + ChatHistory history({{{"role", "user"}, {"content", input}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + templated_input_vector.push_back(templated_prompt); + } + encoded_input = 
m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false)); + } else { + encoded_input = m_tokenizer.encode(*input_vector); } - encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false)); } else if (auto input_prompt = std::get_if(&inputs)) { std::string& prompt = *input_prompt; @@ -110,7 +117,7 @@ DecodedResults StatefulLLMPipeline::generate( m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; - auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); // Do not add special tokens in chat scenario to be aligned with HF. auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)); auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); @@ -163,7 +170,16 @@ DecodedResults StatefulLLMPipeline::generate( // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied } else { - encoded_input = m_tokenizer.encode(prompt); + std::string& prompt = *input_prompt; + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + encoded_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)); + } else { + // in case when chat_template was not found in tokenizer_config.json or set + encoded_input = m_tokenizer.encode(prompt); + } } } diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index b17ee959c5..0d84ef4f3c 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -827,7 +827,15 @@ DecodedResults StatefulLLMPipeline::generate( // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false)); } else { - tokenized_input = m_tokenizer.encode(prompt); + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + tokenized_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)); + } else { + // in case when chat_template was not found in tokenizer_config.json or set + tokenized_input = m_tokenizer.encode(prompt); + } } auto encode_stop_time = std::chrono::steady_clock::now(); @@ -1294,7 +1302,15 @@ DecodedResults StatelessLLMPipeline::generate( // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false)); } else { - tokenized_input = m_tokenizer.encode(prompt); + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + tokenized_input = m_tokenizer.encode(templated_prompt, 
ov::genai::add_special_tokens(false)); + } else { + // in case when chat_template was not found in tokenizer_config.json or set + tokenized_input = m_tokenizer.encode(prompt); + } } auto encode_stop_time = std::chrono::steady_clock::now(); diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 9676cdb5f3..2eadda53ba 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -573,6 +573,10 @@ class Tokenizer::TokenizerImpl { void set_chat_template(const std::string& chat_template) { m_chat_template = patch_chat_template(chat_template); } + + std::string get_chat_template() { + return m_chat_template; + } }; Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties) { @@ -676,6 +680,10 @@ std::string Tokenizer::apply_chat_template(ChatHistory history, return m_pimpl->apply_chat_template(history, add_generation_prompt, chat_template); } +std::string Tokenizer::get_chat_template() const { + return m_pimpl->get_chat_template(); +} + void Tokenizer::set_chat_template(const std::string& chat_template) { m_pimpl->set_chat_template(chat_template); } diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 66b17e5804..e912570f20 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -43,6 +43,8 @@ class InputsEmbedder::IInputsEmbedder { // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0}; + // True if chat template should be applied for non-chat scenario + bool m_apply_chat_template = true; public: virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) = 0; @@ -82,6 +84,10 @@ class InputsEmbedder::IInputsEmbedder { std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history)); } + void set_apply_chat_template_status(bool apply_chat_template) { + m_apply_chat_template = apply_chat_template; + } + virtual void start_chat(const std::string& system_message) { m_is_chat_conversation = true; m_kv_history_manager.reset(); @@ -155,7 +161,7 @@ class InputsEmbedder::IInputsEmbedder { m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; std::string new_templated_chat_history; - try { + try { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); } catch (const std::exception& error) { // Use fallback chat template if it was not found in tokenizer_config.json @@ -169,8 +175,23 @@ class InputsEmbedder::IInputsEmbedder { m_templated_chat_history = std::move(new_templated_chat_history); return {new_chat_tokens, prev_chat_tokens}; } else { + ov::Tensor encoded_input_ids; auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_input_ids = m_tokenizer.encode(prompt).input_ids; + if (m_apply_chat_template) { + std::string templated_prompt; + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + + if (!m_tokenizer.get_chat_template().empty()) { + templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + } else { + // Use fallback chat template if it was not found 
in tokenizer_config.json + templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt, chat_template_fallback); + } + encoded_input_ids = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids; + } else { + encoded_input_ids = m_tokenizer.encode(prompt).input_ids; + } auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); return {encoded_input_ids, ov::Tensor()}; @@ -2046,6 +2067,10 @@ void InputsEmbedder::update_chat_history(const std::string& decoded_results) { return m_impl->update_chat_history(decoded_results); } +void InputsEmbedder::set_apply_chat_template_status(bool apply_chat_template) { + return m_impl->set_apply_chat_template_status(apply_chat_template); +} + void InputsEmbedder::finish_chat() { return m_impl->finish_chat(); } diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 4462c58185..5bd7cd3004 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -58,6 +58,9 @@ class InputsEmbedder { // adds currently generated text to chat history void update_chat_history(const std::string& decoded_results); + // set the apply_chat_template flag, which determines whether chat template should be applied for non-chat scenarios + void set_apply_chat_template_status(bool apply_chat_template); + // finishes chat and clears a chat history void finish_chat(); private: diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 95e3064548..a3f9859384 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -165,6 +165,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { generation_config.set_eos_token_id(m_generation_config.eos_token_id); generation_config.validate(); + m_inputs_embedder->set_apply_chat_template_status(generation_config.apply_chat_template); + auto start_get_inputs_embeds = std::chrono::steady_clock::now(); ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics); auto end_get_inputs_embeds = std::chrono::steady_clock::now(); diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index f1898d1232..1ebb84616c 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -578,6 +578,7 @@ class GenerationConfig: num_return_sequences: the number of sequences to generate from a single prompt. """ adapters: AdapterConfig | None + apply_chat_template: bool assistant_confidence_threshold: float diversity_penalty: float do_sample: bool @@ -1653,6 +1654,7 @@ class Tokenizer: openvino_genai.Tokenizer object is used to initialize Tokenizer if it's located in a different path than the main model. """ + chat_template: str def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}, **kwargs) -> None: ... 
def apply_chat_template(self, history: list[dict[str, str]], add_generation_prompt: bool, chat_template: str = '') -> str: diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index e2a6d7062c..a7d7789a55 100644 --- a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -115,6 +115,7 @@ void init_generation_config(py::module_& m) { .def_readwrite("include_stop_str_in_output", &GenerationConfig::include_stop_str_in_output) .def_readwrite("stop_token_ids", &GenerationConfig::stop_token_ids) .def_readwrite("adapters", &GenerationConfig::adapters) + .def_readwrite("apply_chat_template", &GenerationConfig::apply_chat_template) .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) .def("is_beam_search", &GenerationConfig::is_beam_search) .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding) diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp index 0dd9f3d715..5d8640b9d5 100644 --- a/src/python/py_tokenizer.cpp +++ b/src/python/py_tokenizer.cpp @@ -109,6 +109,12 @@ void init_tokenizer(py::module_& m) { "Override a chat_template read from tokenizer_config.json." ) + .def_property( + "chat_template", + &Tokenizer::get_chat_template, + &Tokenizer::set_chat_template + ) + .def("get_pad_token_id", &Tokenizer::get_pad_token_id) .def("get_bos_token_id", &Tokenizer::get_bos_token_id) .def("get_eos_token_id", &Tokenizer::get_eos_token_id) diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index b0b6a70e93..bc75be7acc 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -252,7 +252,12 @@ def run_hugging_face( # process prompt by promp as we have multiple generation configs for prompt, generation_config in zip(prompts, generation_configs): hf_generation_config = convert_to_hf(opt_model.generation_config, generation_config) - inputs = hf_tokenizer(prompt, return_tensors="pt") + inputs = {} + if hf_tokenizer.chat_template: + prompt = hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + inputs = hf_tokenizer(prompt, return_tensors="pt", add_special_tokens=False) + else: + inputs = hf_tokenizer(prompt, return_tensors="pt") input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] prompt_len = 0 if generation_config.echo else input_ids.numel() @@ -266,8 +271,15 @@ def run_hugging_face( generation_result.m_scores = [score for score in generate_outputs.sequences_scores] generation_results.append(generation_result) else: - # process all prompts as a single batch as we have a single generation config for all prompts - inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True, padding_side='left') + inputs = {} + if hf_tokenizer.chat_template: + processed_prompts = [] + for prompt in prompts: + processed_prompts.append(hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)) + # process all prompts as a single batch as we have a single generation config for all prompts + inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=False, padding_side='left') + else: + inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, padding_side='left') input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] hf_generation_config = 
convert_to_hf(opt_model.generation_config, generation_configs) hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer) diff --git a/tests/python_tests/test_generation_config.py b/tests/python_tests/test_generation_config.py index 682f4a93e2..a674ca8e81 100644 --- a/tests/python_tests/test_generation_config.py +++ b/tests/python_tests/test_generation_config.py @@ -58,6 +58,8 @@ def verify_set_values(generation_config, kwargs): dict(max_new_tokens=1, assistant_confidence_threshold=0.5), dict(max_new_tokens=1, num_assistant_tokens=2), dict(max_new_tokens=1, num_assistant_tokens=2, max_ngram_size=2), # prompt lookup + dict(max_new_tokens=1, apply_chat_template=True), + dict(max_new_tokens=1, apply_chat_template=False), ] @pytest.mark.parametrize("generation_config_kwargs", configs) @pytest.mark.precommit diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py index 8968f2a083..1396b23aa7 100644 --- a/tests/python_tests/test_llm_pipeline.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -339,7 +339,7 @@ def test_unicode_pybind_decoding_one_string(): # Test that pybind will not fail. model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') ov_pipe = read_model((model_id, path))[4] - res_str = ov_pipe.generate(',', max_new_tokens=4) + res_str = ov_pipe.generate(',', max_new_tokens=4, apply_chat_template=False) assert '�' == res_str[-1] @@ -350,7 +350,7 @@ def test_unicode_pybind_decoding_batched(): # Test that pybind will not fail. model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') ov_pipe = read_model((model_id, path))[4] - res_str = ov_pipe.generate([","], max_new_tokens=4) + res_str = ov_pipe.generate([","], max_new_tokens=4, apply_chat_template=False) assert '�' == res_str.texts[0][-1] @@ -362,7 +362,7 @@ def test_unicode_pybind_decoding_one_string_streamer(): model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') ov_pipe = read_model((model_id, path))[4] res_str = [] - ov_pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x)) + ov_pipe.generate(",", max_new_tokens=4, apply_chat_template=False, streamer=lambda x: res_str.append(x)) assert '�' == ''.join(res_str)[-1] # diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index 7a3aced29a..3ad1335681 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -18,7 +18,7 @@ (dict(max_new_tokens=30, min_new_tokens=30), '你好! 
你好嗎?'), (dict(max_new_tokens=30, ignore_eos=True), 'Alan Turing was a'), # (dict(max_length=40), 'table is made of'), - (dict(stop_token_ids={28998}), 'The Sun is yellow because'), # since a test does not hang, it means stop token is met + (dict(stop_token_ids={28998}, apply_chat_template=False), 'The Sun is yellow because'), # since the test does not hang, the stop token is met; skip the chat template to generate a long answer # (dict(max_new_tokens=1, min_new_tokens=0, echo=True), 'What is OpenVINO?') ], ids=["max_new_tokens", diff --git a/tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834205_output.png b/tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834205_output.png new file mode 100644 index 0000000000..b45b62e5e8 --- /dev/null +++ b/tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834205_output.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8ec37757008e9e8ac18b6cebfb7a64e88eff17511be3f24c8aa219e192ae56b +size 134952 diff --git a/tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834425_output.png b/tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834425_output.png new file mode 100644 index 0000000000..d7c4c010be --- /dev/null +++ b/tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834425_output.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720577da63f0352976eacf388fb6b113a2c62f613bf581bff6c18fb10db58926 +size 135308 diff --git a/tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834205_output.png b/tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834205_output.png new file mode 100644 index 0000000000..b45b62e5e8 --- /dev/null +++ b/tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834205_output.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8ec37757008e9e8ac18b6cebfb7a64e88eff17511be3f24c8aa219e192ae56b +size 134952 diff --git a/tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834425_output.png b/tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834425_output.png new file mode 100644 index 0000000000..d7c4c010be --- /dev/null +++ b/tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834425_output.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720577da63f0352976eacf388fb6b113a2c62f613bf581bff6c18fb10db58926 +size 135308 diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index 76f5678dd9..19d64197f1 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ -234,6 +234,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data gen_config.rng_seed = args["seed"] gen_config.num_beams = args["num_beams"] gen_config.do_sample = False + gen_config.apply_chat_template = False if args.get('draft_model', ''): config_info = "Speculative decoding config: " if args.get('num_assistant_tokens', None): @@ -380,7 +381,8 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] gen_config.do_sample = False gen_config.ignore_eos = True + gen_config.apply_chat_template = False enable_prompt_permutations = not args.get("disable_prompt_permutation", False) if enable_prompt_permutations: log.warning(
diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py index a02b16b2bb..9cc6702999 100644 --- a/tools/llm_bench/task/visual_language_generation.py +++ b/tools/llm_bench/task/visual_language_generation.py @@ -211,6 +211,7 @@ def run_visual_language_generation_genai( gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] gen_config.do_sample = False + gen_config.apply_chat_template = False kwargs = {} if len(images) >= 1: kwargs["images"] = images[0] diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py index 7d4354f846..fa7dc40401 100644 --- a/tools/who_what_benchmark/whowhatbench/wwb.py +++ b/tools/who_what_benchmark/whowhatbench/wwb.py @@ -267,7 +267,7 @@ def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question, us model.finish_chat() return result else: - return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens) + return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens, apply_chat_template=False) def llamacpp_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False): @@ -336,6 +336,7 @@ def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_to config = model.get_generation_config() config.max_new_tokens = max_new_tokens config.do_sample = False + config.apply_chat_template = False model.set_generation_config(config) if tokenizer.chat_template is not None: model.start_chat(tokenizer.chat_template)
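For reference, a minimal sketch of how the new `GenerationConfig.apply_chat_template` field and the `Tokenizer.chat_template` property introduced in this patch are meant to be used together, assuming an `openvino_genai` build that includes these changes; the model directory `./TinyLlama-1.1B-Chat-v1.0` is a placeholder:

```python
import openvino_genai as ov_genai

# Placeholder model directory; substitute your own exported model.
pipe = ov_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0", "CPU")
tokenizer = pipe.get_tokenizer()

# New Tokenizer.chat_template property: check whether a template is set at all.
print("chat template present:", bool(tokenizer.chat_template))

# New GenerationConfig.apply_chat_template field: benchmarking and accuracy tools can
# keep their usual config and only opt out of templating, as wwb.py and llm_bench do above.
config = pipe.get_generation_config()
config.max_new_tokens = 32
config.do_sample = False
config.apply_chat_template = False  # feed the raw prompt, as before this change
print(pipe.generate("Why is the Sun yellow?", config))
```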