From 5924b23afe13d4299e3a1d49f72b6abca62346c6 Mon Sep 17 00:00:00 2001 From: sbalandi Date: Mon, 13 Jan 2025 12:49:57 +0000 Subject: [PATCH] Automatically apply chat template in non-chat scenarios --- .github/workflows/causal_lm_cpp.yml | 20 +++++++---- README.md | 1 - samples/cpp/text_generation/README.md | 2 +- samples/python/text_generation/README.md | 2 +- src/README.md | 2 ++ .../openvino/genai/generation_config.hpp | 4 +++ .../include/openvino/genai/llm_pipeline.hpp | 4 +++ src/cpp/include/openvino/genai/tokenizer.hpp | 3 ++ .../genai/visual_language/pipeline.hpp | 8 +++++ .../genai/whisper_generation_config.hpp | 2 ++ src/cpp/src/generation_config.cpp | 1 + src/cpp/src/icontinuous_batching.cpp | 15 ++++++-- src/cpp/src/llm_pipeline_stateful.cpp | 34 ++++++++++++++----- src/cpp/src/llm_pipeline_static.cpp | 20 +++++++++-- src/cpp/src/tokenizer.cpp | 8 +++++ .../src/visual_language/inputs_embedder.cpp | 29 ++++++++++++++-- .../src/visual_language/inputs_embedder.hpp | 3 ++ src/cpp/src/visual_language/pipeline.cpp | 2 ++ .../openvino_genai/py_openvino_genai.pyi | 2 ++ src/python/py_generation_config.cpp | 1 + src/python/py_tokenizer.cpp | 6 ++++ tests/python_tests/common.py | 18 ++++++++-- tests/python_tests/test_generation_config.py | 2 ++ tests/python_tests/test_llm_pipeline.py | 6 ++-- tests/python_tests/test_sampling.py | 2 +- ...fusion-v2-1_p0_iter0_pid2834205_output.png | 3 ++ ...fusion-v2-1_p0_iter0_pid2834425_output.png | 3 ++ ...fusion-v2-1_p0_iter1_pid2834205_output.png | 3 ++ ...fusion-v2-1_p0_iter1_pid2834425_output.png | 3 ++ tools/llm_bench/task/text_generation.py | 12 +++++++ .../task/visual_language_generation.py | 1 + tools/who_what_benchmark/whowhatbench/wwb.py | 3 +- 32 files changed, 193 insertions(+), 32 deletions(-) create mode 100644 tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834205_output.png create mode 100644 tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834425_output.png create mode 100644 tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834205_output.png create mode 100644 tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834425_output.png diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index c5ac2c6acc..0755847d55 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -209,7 +209,7 @@ jobs: " echo "你好! 你好嗎?" passed - timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt + timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" "return 0" "你好! 你好嗎?" > ./pred.txt python -c " import transformers with open('pred.txt', 'r', errors='ignore') as file: @@ -219,7 +219,7 @@ jobs: print('\n\n') tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') prompts = [ - 'Alan Turing was a', + 'Why is the Sun yellow?', 'return 0', '你好! 你好嗎?' 
] @@ -227,7 +227,6 @@ jobs: if tokenizer.chat_template: prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) - print(tokenized) for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) print(ref) @@ -277,7 +276,10 @@ jobs: echo import transformers > ref.py echo predictions = open('cpp.txt', 'r').read() >> ref.py echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py - echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py + echo prompt = '69' >> ref.py + echo if tokenizer.chat_template: >> ref.py + echo prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) >> ref.py + echo tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) >> ref.py echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py echo ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py echo idx = predictions.find(ref) >> ref.py @@ -584,7 +586,10 @@ jobs: with open('pred_greedy.txt', 'r') as file: predictions = file.read() tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5') - tokenized = tokenizer('Alan Turing was a', return_tensors='pt') + prompt = 'Alan Turing was a' + if tokenizer.chat_template: + prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False): ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) @@ -639,7 +644,10 @@ jobs: with open('pred_greedy.txt', 'r') as file: predictions = file.read() tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat') - tokenized = tokenizer('Alan Turing was a', return_tensors='pt') + prompt = 'Alan Turing was a' + if tokenizer.chat_template: + prompt = tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(prompt, return_tensors='pt', add_special_tokens=False) for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False): ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) diff --git a/README.md b/README.md index cea1e358bc..221a81c6c3 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,6 @@ from PIL import Image # Choose GPU instead of CPU in the line below to run the model on Intel integrated or discrete GPU pipe = openvino_genai.VLMPipeline("./InternVL2-1B", "CPU") -pipe.start_chat() image = Image.open("dog.jpg") image_data = 
np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8) diff --git a/samples/cpp/text_generation/README.md b/samples/cpp/text_generation/README.md index f370c74a80..886028ed3f 100644 --- a/samples/cpp/text_generation/README.md +++ b/samples/cpp/text_generation/README.md @@ -48,7 +48,7 @@ Recommended models: meta-llama/Llama-2-7b-chat-hf, TinyLlama/TinyLlama-1.1B-Chat ./chat_sample ``` #### Missing chat template -If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. +If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work around this, manually add the chat template to the tokenizer_config.json of your model, or update it by calling `pipe.get_tokenizer().set_chat_template(new_chat_template)`. The following template can be used as a default, but it may not work properly with every model: ``` "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", diff --git a/samples/python/text_generation/README.md b/samples/python/text_generation/README.md index 84b5302639..36e456d90b 100644 --- a/samples/python/text_generation/README.md +++ b/samples/python/text_generation/README.md @@ -48,7 +48,7 @@ Recommended models: meta-llama/Llama-2-7b-chat-hf, TinyLlama/TinyLlama-1.1B-Chat python chat_sample.py model_dir ``` #### Missing chat template -If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. +If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work around this, manually add the chat template to the tokenizer_config.json of your model, or update it by calling `pipe.get_tokenizer().set_chat_template(new_chat_template)`. The following template can be used as a default, but it may not work properly with every model: ``` "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", diff --git a/src/README.md b/src/README.md index af4953f98a..c2ed8c2a60 100644 --- a/src/README.md +++ b/src/README.md @@ -73,6 +73,8 @@ output: 'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color. The arrangement of carbon atoms in' ``` +>**Note**: The chat_template from tokenizer_config.json or from the tokenizer/detokenizer model is applied to the prompt automatically at the generation stage. To disable it, call `pipe.get_tokenizer().set_chat_template("")` or set `generation_config.apply_chat_template` to `False`.
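As an illustration of the note above, here is a minimal Python sketch of both opt-out paths; the model directory `./TinyLlama-1.1B-Chat-v1.0` is a placeholder, so substitute your own exported model and device:

```python
import openvino_genai as ov_genai

# Placeholder model directory; use any LLM exported for OpenVINO GenAI.
pipe = ov_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0", "CPU")

# Default behaviour after this change: the chat template is applied to the raw prompt.
print(pipe.generate("Why is the Sun yellow?", max_new_tokens=30))

# Opt out for a single call through the generation config.
print(pipe.generate("Why is the Sun yellow?", max_new_tokens=30, apply_chat_template=False))

# Or opt out globally by clearing the template on the tokenizer.
pipe.get_tokenizer().set_chat_template("")
print(pipe.generate("Why is the Sun yellow?", max_new_tokens=30))
```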
+ A simple chat in Python: ```python import openvino_genai as ov_genai diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 3a75fc02ea..e3f1abb002 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -128,6 +128,8 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { std::optional adapters; + bool apply_chat_template = true; + /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. * Otherwise verifies eos_token_id == tokenizer_eos_token_id. */ @@ -189,6 +191,8 @@ extern OPENVINO_GENAI_EXPORTS ov::Property rng_seed; static constexpr ov::Property assistant_confidence_threshold{"assistant_confidence_threshold"}; static constexpr ov::Property num_assistant_tokens{"num_assistant_tokens"}; +static constexpr ov::Property apply_chat_template{"apply_chat_template"}; + // Predefined Configs OPENVINO_DEPRECATED("Please, use individual parameters instead of predefined configs. This method will be removed in 2026.0.0 release") diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 31b1ac1675..26232574dc 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -177,6 +177,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param generation_config optional GenerationConfig * @param streamer optional streamer * @return DecodedResults decoded resulting text + * The chat_template will be applied to the prompt; run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it. + * To disable it in non-chat mode, set the chat template to an empty string ("") or set generation_config.apply_chat_template to false. */ DecodedResults generate( StringInputs inputs, @@ -191,6 +193,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param inputs input prompt or a vector of prompts * @param properties properties * @return DecodedResults decoded resulting text + * The chat_template will be applied to the prompt; run pipe.get_tokenizer().set_chat_template(custom_chat_template) to update it. + * To disable it in non-chat mode, set the chat template to an empty string ("") or set generation_config.apply_chat_template to false. */ template util::EnableIfAllStringAny generate( diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 0a54d1da2a..bde4eb3fe1 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -221,6 +221,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { /// @param chat_template The new template to override with. void set_chat_template(const std::string& chat_template); + /// @brief Returns the current chat template, e.g. to check whether it is empty. + std::string get_chat_template() const; + // information about , tokens should be public, // they are used at least in StreamerBase descendants int64_t get_bos_token_id() const; diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index 8c3d380b0f..b6b1d5c7f6 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -98,6 +98,8 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// @param generation_config A config to follow for text generation.
/// @param streamer A streamer to acquire intermediate result. /// @return A string generated by a model. + /// The chat_template will be applied to the prompt; run pipe.set_chat_template(custom_chat_template) to update it. + /// To disable it in non-chat mode, set the chat template to an empty string ("") or set generation_config.apply_chat_template to false. VLMDecodedResults generate( const std::string& prompt, const std::vector& rgbs, @@ -111,6 +113,8 @@ /// @param generation_config A config to follow for text generation. /// @param streamer A streamer to acquire intermediate result. /// @return A string generated by a model. + /// The chat_template will be applied to the prompt; run pipe.set_chat_template(custom_chat_template) to update it. + /// To disable it in non-chat mode, set the chat template to an empty string ("") or set generation_config.apply_chat_template to false. VLMDecodedResults generate( const std::string& prompt, const ov::Tensor& rgb, @@ -124,6 +128,8 @@ /// for its members, StreamerVariant a single image or multiple /// images. /// @return A string generated by a model. + /// The chat_template will be applied to the prompt; run pipe.set_chat_template(custom_chat_template) to update it. + /// To disable it in non-chat mode, set the chat template to an empty string ("") or set generation_config.apply_chat_template to false. VLMDecodedResults generate( const std::string& prompt, const ov::AnyMap& config_map @@ -137,6 +143,8 @@ /// @param ...properties ov::Property instances to be combined into /// ov::AnyMap. /// @return A string generated by a model. + /// The chat_template will be applied to the prompt; run pipe.set_chat_template(custom_chat_template) to update it. + /// To disable it in non-chat mode, set the chat template to an empty string ("") or set generation_config.apply_chat_template to false. template util::EnableIfAllStringAny generate( const std::string& prompt, diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp b/src/cpp/include/openvino/genai/whisper_generation_config.hpp index 18b4202609..4bc186495f 100644 --- a/src/cpp/include/openvino/genai/whisper_generation_config.hpp +++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp @@ -97,6 +97,8 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig : public GenerationConfig { // A list containing the non-speech tokens that will be suppressed during generation.
std::vector suppress_tokens; + bool apply_chat_template = false; + void update_generation_config(const ov::AnyMap& config_map = {}); template diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index de23852c9b..3914e217c4 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -128,6 +128,7 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& properties) { read_anymap_param(properties, "logprobs", logprobs); read_anymap_param(properties, "num_return_sequences", num_return_sequences); read_anymap_param(properties, "adapters", adapters); + read_anymap_param(properties, "apply_chat_template", apply_chat_template); // penalties read_anymap_param(properties, "frequency_penalty", frequency_penalty); diff --git a/src/cpp/src/icontinuous_batching.cpp b/src/cpp/src/icontinuous_batching.cpp index 78f8fda8f7..6b748d6665 100644 --- a/src/cpp/src/icontinuous_batching.cpp +++ b/src/cpp/src/icontinuous_batching.cpp @@ -53,9 +53,20 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( } else { input_ids.reserve(prompts.size()); timer.start(); - for (const std::string& prompt : prompts) { + for (size_t i = 0; i < prompts.size(); i++) { + const std::string& prompt = prompts.at(i); const auto encode_start = std::chrono::steady_clock::now(); - input_ids.push_back(m_tokenizer.encode(prompt).input_ids); + ov::Tensor encoded_inputs; + if (sampling_params.at(i).apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + encoded_inputs = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids; + } else { + // in case when chat_template was not found in tokenizer_config.json or set + encoded_inputs = m_tokenizer.encode(prompt).input_ids; + } + input_ids.push_back(encoded_inputs); tokenization_durations.emplace_back(PerfMetrics::get_microsec(std::chrono::steady_clock::now() - encode_start)); } timer.end(); diff --git a/src/cpp/src/llm_pipeline_stateful.cpp b/src/cpp/src/llm_pipeline_stateful.cpp index 6836e57257..250468ef9f 100644 --- a/src/cpp/src/llm_pipeline_stateful.cpp +++ b/src/cpp/src/llm_pipeline_stateful.cpp @@ -9,6 +9,8 @@ #include "text_callback_streamer.hpp" #include "utils.hpp" +#include "debug_utils.hpp" + namespace ov::genai { StatefulLLMPipeline::StatefulLLMPipeline( @@ -87,14 +89,19 @@ DecodedResults StatefulLLMPipeline::generate( TokenizedInputs encoded_input; if (auto input_vector = std::get_if>(&inputs)) { - std::vector templated_input_vector; - for (auto& input : *input_vector) { - ChatHistory history({{{"role", "user"}, {"content", input}}}); - constexpr bool add_generation_prompt = true; - auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); - templated_input_vector.push_back(templated_prompt); + OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts"); + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + std::vector templated_input_vector; + for (auto& input : *input_vector) { + ChatHistory history({{{"role", "user"}, {"content", input}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + templated_input_vector.push_back(templated_prompt); + } + encoded_input = 
m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false)); + } else { + encoded_input = m_tokenizer.encode(*input_vector); } - encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false)); } else if (auto input_prompt = std::get_if(&inputs)) { std::string& prompt = *input_prompt; @@ -110,7 +117,7 @@ DecodedResults StatefulLLMPipeline::generate( m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; - auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); // Do not add special tokens in chat scenario to be aligned with HF. auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)); auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); @@ -163,7 +170,16 @@ DecodedResults StatefulLLMPipeline::generate( // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied } else { - encoded_input = m_tokenizer.encode(prompt); + std::string& prompt = *input_prompt; + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + encoded_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)); + } else { + // in case when chat_template was not found in tokenizer_config.json or set + encoded_input = m_tokenizer.encode(prompt); + } } } diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index b17ee959c5..0d84ef4f3c 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -827,7 +827,15 @@ DecodedResults StatefulLLMPipeline::generate( // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false)); } else { - tokenized_input = m_tokenizer.encode(prompt); + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + tokenized_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)); + } else { + // in case when chat_template was not found in tokenizer_config.json or set + tokenized_input = m_tokenizer.encode(prompt); + } } auto encode_stop_time = std::chrono::steady_clock::now(); @@ -1294,7 +1302,15 @@ DecodedResults StatelessLLMPipeline::generate( // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false)); } else { - tokenized_input = m_tokenizer.encode(prompt); + if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) { + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + tokenized_input = m_tokenizer.encode(templated_prompt, 
ov::genai::add_special_tokens(false)); + } else { + // in case when chat_template was not found in tokenizer_config.json or set + tokenized_input = m_tokenizer.encode(prompt); + } } auto encode_stop_time = std::chrono::steady_clock::now(); diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 9676cdb5f3..2eadda53ba 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -573,6 +573,10 @@ class Tokenizer::TokenizerImpl { void set_chat_template(const std::string& chat_template) { m_chat_template = patch_chat_template(chat_template); } + + std::string get_chat_template() { + return m_chat_template; + } }; Tokenizer::Tokenizer(const std::filesystem::path& tokenizer_path, const ov::AnyMap& properties) { @@ -676,6 +680,10 @@ std::string Tokenizer::apply_chat_template(ChatHistory history, return m_pimpl->apply_chat_template(history, add_generation_prompt, chat_template); } +std::string Tokenizer::get_chat_template() const { + return m_pimpl->get_chat_template(); +} + void Tokenizer::set_chat_template(const std::string& chat_template) { m_pimpl->set_chat_template(chat_template); } diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 66b17e5804..e912570f20 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -43,6 +43,8 @@ class InputsEmbedder::IInputsEmbedder { // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0}; + // True if chat template should be applied for non-chat scenario + bool m_apply_chat_template = true; public: virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) = 0; @@ -82,6 +84,10 @@ class InputsEmbedder::IInputsEmbedder { std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history)); } + void set_apply_chat_template_status(bool apply_chat_template) { + m_apply_chat_template = apply_chat_template; + } + virtual void start_chat(const std::string& system_message) { m_is_chat_conversation = true; m_kv_history_manager.reset(); @@ -155,7 +161,7 @@ class InputsEmbedder::IInputsEmbedder { m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; std::string new_templated_chat_history; - try { + try { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); } catch (const std::exception& error) { // Use fallback chat template if it was not found in tokenizer_config.json @@ -169,8 +175,23 @@ class InputsEmbedder::IInputsEmbedder { m_templated_chat_history = std::move(new_templated_chat_history); return {new_chat_tokens, prev_chat_tokens}; } else { + ov::Tensor encoded_input_ids; auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_input_ids = m_tokenizer.encode(prompt).input_ids; + if (m_apply_chat_template) { + std::string templated_prompt; + ChatHistory history({{{"role", "user"}, {"content", prompt}}}); + constexpr bool add_generation_prompt = true; + + if (!m_tokenizer.get_chat_template().empty()) { + templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt); + } else { + // Use fallback chat template if it was not found 
in tokenizer_config.json + templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt, chat_template_fallback); + } + encoded_input_ids = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids; + } else { + encoded_input_ids = m_tokenizer.encode(prompt).input_ids; + } auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); return {encoded_input_ids, ov::Tensor()}; @@ -2046,6 +2067,10 @@ void InputsEmbedder::update_chat_history(const std::string& decoded_results) { return m_impl->update_chat_history(decoded_results); } +void InputsEmbedder::set_apply_chat_template_status(bool apply_chat_template) { + return m_impl->set_apply_chat_template_status(apply_chat_template); +} + void InputsEmbedder::finish_chat() { return m_impl->finish_chat(); } diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 4462c58185..5bd7cd3004 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -58,6 +58,9 @@ class InputsEmbedder { // adds currently generated text to chat history void update_chat_history(const std::string& decoded_results); + // set the apply_chat_template flag, which determines whether chat template should be applied for non-chat scenarios + void set_apply_chat_template_status(bool apply_chat_template); + // finishes chat and clears a chat history void finish_chat(); private: diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 95e3064548..a3f9859384 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -165,6 +165,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { generation_config.set_eos_token_id(m_generation_config.eos_token_id); generation_config.validate(); + m_inputs_embedder->set_apply_chat_template_status(generation_config.apply_chat_template); + auto start_get_inputs_embeds = std::chrono::steady_clock::now(); ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics); auto end_get_inputs_embeds = std::chrono::steady_clock::now(); diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index f1898d1232..1ebb84616c 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -578,6 +578,7 @@ class GenerationConfig: num_return_sequences: the number of sequences to generate from a single prompt. """ adapters: AdapterConfig | None + apply_chat_template: bool assistant_confidence_threshold: float diversity_penalty: float do_sample: bool @@ -1653,6 +1654,7 @@ class Tokenizer: openvino_genai.Tokenizer object is used to initialize Tokenizer if it's located in a different path than the main model. """ + chat_template: str def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}, **kwargs) -> None: ... 
def apply_chat_template(self, history: list[dict[str, str]], add_generation_prompt: bool, chat_template: str = '') -> str: diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index e2a6d7062c..a7d7789a55 100644 --- a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -115,6 +115,7 @@ void init_generation_config(py::module_& m) { .def_readwrite("include_stop_str_in_output", &GenerationConfig::include_stop_str_in_output) .def_readwrite("stop_token_ids", &GenerationConfig::stop_token_ids) .def_readwrite("adapters", &GenerationConfig::adapters) + .def_readwrite("apply_chat_template", &GenerationConfig::apply_chat_template) .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) .def("is_beam_search", &GenerationConfig::is_beam_search) .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding) diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp index 0dd9f3d715..5d8640b9d5 100644 --- a/src/python/py_tokenizer.cpp +++ b/src/python/py_tokenizer.cpp @@ -109,6 +109,12 @@ void init_tokenizer(py::module_& m) { "Override a chat_template read from tokenizer_config.json." ) + .def_property( + "chat_template", + &Tokenizer::get_chat_template, + &Tokenizer::set_chat_template + ) + .def("get_pad_token_id", &Tokenizer::get_pad_token_id) .def("get_bos_token_id", &Tokenizer::get_bos_token_id) .def("get_eos_token_id", &Tokenizer::get_eos_token_id) diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py index b0b6a70e93..bc75be7acc 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -252,7 +252,12 @@ def run_hugging_face( # process prompt by promp as we have multiple generation configs for prompt, generation_config in zip(prompts, generation_configs): hf_generation_config = convert_to_hf(opt_model.generation_config, generation_config) - inputs = hf_tokenizer(prompt, return_tensors="pt") + inputs = {} + if hf_tokenizer.chat_template: + prompt = hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True) + inputs = hf_tokenizer(prompt, return_tensors="pt", add_special_tokens=False) + else: + inputs = hf_tokenizer(prompt, return_tensors="pt") input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] prompt_len = 0 if generation_config.echo else input_ids.numel() @@ -266,8 +271,15 @@ def run_hugging_face( generation_result.m_scores = [score for score in generate_outputs.sequences_scores] generation_results.append(generation_result) else: - # process all prompts as a single batch as we have a single generation config for all prompts - inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True, padding_side='left') + inputs = {} + if hf_tokenizer.chat_template: + processed_prompts = [] + for prompt in prompts: + processed_prompts.append(hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)) + # process all prompts as a single batch as we have a single generation config for all prompts + inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=False, padding_side='left') + else: + inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, padding_side='left') input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask'] hf_generation_config = 
convert_to_hf(opt_model.generation_config, generation_configs) hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer) diff --git a/tests/python_tests/test_generation_config.py b/tests/python_tests/test_generation_config.py index 682f4a93e2..a674ca8e81 100644 --- a/tests/python_tests/test_generation_config.py +++ b/tests/python_tests/test_generation_config.py @@ -58,6 +58,8 @@ def verify_set_values(generation_config, kwargs): dict(max_new_tokens=1, assistant_confidence_threshold=0.5), dict(max_new_tokens=1, num_assistant_tokens=2), dict(max_new_tokens=1, num_assistant_tokens=2, max_ngram_size=2), # prompt lookup + dict(max_new_tokens=1, apply_chat_template=True), + dict(max_new_tokens=1, apply_chat_template=False), ] @pytest.mark.parametrize("generation_config_kwargs", configs) @pytest.mark.precommit diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py index 8968f2a083..1396b23aa7 100644 --- a/tests/python_tests/test_llm_pipeline.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -339,7 +339,7 @@ def test_unicode_pybind_decoding_one_string(): # Test that pybind will not fail. model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') ov_pipe = read_model((model_id, path))[4] - res_str = ov_pipe.generate(',', max_new_tokens=4) + res_str = ov_pipe.generate(',', max_new_tokens=4, apply_chat_template=False) assert '�' == res_str[-1] @@ -350,7 +350,7 @@ def test_unicode_pybind_decoding_batched(): # Test that pybind will not fail. model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') ov_pipe = read_model((model_id, path))[4] - res_str = ov_pipe.generate([","], max_new_tokens=4) + res_str = ov_pipe.generate([","], max_new_tokens=4, apply_chat_template=False) assert '�' == res_str.texts[0][-1] @@ -362,7 +362,7 @@ def test_unicode_pybind_decoding_one_string_streamer(): model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') ov_pipe = read_model((model_id, path))[4] res_str = [] - ov_pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x)) + ov_pipe.generate(",", max_new_tokens=4, apply_chat_template=False, streamer=lambda x: res_str.append(x)) assert '�' == ''.join(res_str)[-1] # diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index 7a3aced29a..3ad1335681 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -18,7 +18,7 @@ (dict(max_new_tokens=30, min_new_tokens=30), '你好! 
你好嗎?'), (dict(max_new_tokens=30, ignore_eos=True), 'Alan Turing was a'), # (dict(max_length=40), 'table is made of'), - (dict(stop_token_ids={28998}), 'The Sun is yellow because'), # since a test does not hang, it means stop token is met + (dict(stop_token_ids={28998}, apply_chat_template=False), 'The Sun is yellow because'), # since the test does not hang, the stop token is met; skip the chat template to generate a long answer # (dict(max_new_tokens=1, min_new_tokens=0, echo=True), 'What is OpenVINO?') ], ids=["max_new_tokens", diff --git a/tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834205_output.png b/tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834205_output.png new file mode 100644 index 0000000000..b45b62e5e8 --- /dev/null +++ b/tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834205_output.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8ec37757008e9e8ac18b6cebfb7a64e88eff17511be3f24c8aa219e192ae56b +size 134952 diff --git a/tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834425_output.png b/tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834425_output.png new file mode 100644 index 0000000000..d7c4c010be --- /dev/null +++ b/tools/llm_bench/stable-diffusion-v2-1_p0_iter0_pid2834425_output.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720577da63f0352976eacf388fb6b113a2c62f613bf581bff6c18fb10db58926 +size 135308 diff --git a/tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834205_output.png b/tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834205_output.png new file mode 100644 index 0000000000..b45b62e5e8 --- /dev/null +++ b/tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834205_output.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8ec37757008e9e8ac18b6cebfb7a64e88eff17511be3f24c8aa219e192ae56b +size 134952 diff --git a/tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834425_output.png b/tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834425_output.png new file mode 100644 index 0000000000..d7c4c010be --- /dev/null +++ b/tools/llm_bench/stable-diffusion-v2-1_p0_iter1_pid2834425_output.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720577da63f0352976eacf388fb6b113a2c62f613bf581bff6c18fb10db58926 +size 135308 diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py index 76f5678dd9..19d64197f1 100644 --- a/tools/llm_bench/task/text_generation.py +++ b/tools/llm_bench/task/text_generation.py @@ -234,6 +234,7 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data gen_config.rng_seed = args["seed"] gen_config.num_beams = args["num_beams"] gen_config.do_sample = False + gen_config.apply_chat_template = False if args.get('draft_model', ''): config_info = "Speculative decoding config: " if args.get('num_assistant_tokens', None): @@ -380,7 +381,8 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] gen_config.do_sample = False gen_config.ignore_eos = True + gen_config.apply_chat_template = False enable_prompt_permutations = not args.get("disable_prompt_permutation", False) if enable_prompt_permutations: log.warning(
diff --git a/tools/llm_bench/task/visual_language_generation.py b/tools/llm_bench/task/visual_language_generation.py index a02b16b2bb..9cc6702999 100644 --- a/tools/llm_bench/task/visual_language_generation.py +++ b/tools/llm_bench/task/visual_language_generation.py @@ -211,6 +211,7 @@ def run_visual_language_generation_genai( gen_config.max_new_tokens = max_gen_tokens gen_config.num_beams = args["num_beams"] gen_config.do_sample = False + gen_config.apply_chat_template = False kwargs = {} if len(images) >= 1: kwargs["images"] = images[0] diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py index 7d4354f846..fa7dc40401 100644 --- a/tools/who_what_benchmark/whowhatbench/wwb.py +++ b/tools/who_what_benchmark/whowhatbench/wwb.py @@ -267,7 +267,7 @@ def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question, us model.finish_chat() return result else: - return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens) + return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens, apply_chat_template=False) def llamacpp_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False): @@ -336,6 +336,7 @@ def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_to config = model.get_generation_config() config.max_new_tokens = max_new_tokens config.do_sample = False + config.apply_chat_template = False model.set_generation_config(config) if tokenizer.chat_template is not None: model.start_chat(tokenizer.chat_template)
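For reference, a minimal sketch of how the new `GenerationConfig.apply_chat_template` field and the `Tokenizer.chat_template` property introduced in this patch are meant to be used together, assuming an `openvino_genai` build that includes these changes; the model directory `./TinyLlama-1.1B-Chat-v1.0` is a placeholder:

```python
import openvino_genai as ov_genai

# Placeholder model directory; substitute your own exported model.
pipe = ov_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0", "CPU")
tokenizer = pipe.get_tokenizer()

# New Tokenizer.chat_template property: check whether a template is set at all.
print("chat template present:", bool(tokenizer.chat_template))

# New GenerationConfig.apply_chat_template field: benchmarking and accuracy tools can
# keep their usual config and only opt out of templating, as wwb.py and llm_bench do above.
config = pipe.get_generation_config()
config.max_new_tokens = 32
config.do_sample = False
config.apply_chat_template = False  # feed the raw prompt, as before this change
print(pipe.generate("Why is the Sun yellow?", config))
```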