
Commit

test
sbalandi committed Jan 28, 2025
1 parent 5924b23 commit cdbd22e
Showing 16 changed files with 1,080 additions and 1,057 deletions.
2,000 changes: 1,000 additions & 1,000 deletions .github/workflows/causal_lm_cpp.yml

Large diffs are not rendered by default.

58 changes: 29 additions & 29 deletions .github/workflows/job_vlm_sample_llava.yml
@@ -14,32 +14,32 @@ env:
l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-17911-83c047443de/l_openvino_toolkit_ubuntu22_2025.1.0.dev20250116_x86_64.tgz

jobs:
visual_language_chat_sample-ubuntu-llava:
runs-on: ubuntu-22.04-16-cores
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
with:
python-version: 3.11
- uses: ./.github/actions/install_openvino
with:
ov_link: ${{ env.l_u22_ov_link }}
- uses: ./.github/actions/build_app
with:
build_target: 'visual_language_chat py_openvino_genai'
- uses: ./.github/actions/install_python_deps
- name: Download and convert model
run: |
source ./ov/setupvars.sh
optimum-cli export openvino --model ${{ inputs.model_id }} ./${{ inputs.model_dir }}
- name: Download images
run: |
wget https://llava-vl.github.io/static/images/monalisa.jpg
- name: Run visual_language_chat C++ sample
run: >
source ./ov/setupvars.sh
&& ./build/samples/cpp/visual_language_chat/visual_language_chat ./${{ inputs.model_dir }} monalisa.jpg
<<< $'Who drew this painting?\nWhen did the painter live?'
timeout-minutes: 4
# visual_language_chat_sample-ubuntu-llava:
# runs-on: ubuntu-22.04-16-cores
# steps:
# - uses: actions/checkout@v4
# with:
# submodules: recursive
# - uses: actions/setup-python@v4
# with:
# python-version: 3.11
# - uses: ./.github/actions/install_openvino
# with:
# ov_link: ${{ env.l_u22_ov_link }}
# - uses: ./.github/actions/build_app
# with:
# build_target: 'visual_language_chat py_openvino_genai'
# - uses: ./.github/actions/install_python_deps
# - name: Download and convert model
# run: |
# source ./ov/setupvars.sh
# optimum-cli export openvino --model ${{ inputs.model_id }} ./${{ inputs.model_dir }}
# - name: Download images
# run: |
# wget https://llava-vl.github.io/static/images/monalisa.jpg
# - name: Run visual_language_chat C++ sample
# run: >
# source ./ov/setupvars.sh
# && ./build/samples/cpp/visual_language_chat/visual_language_chat ./${{ inputs.model_dir }} monalisa.jpg
# <<< $'Who drew this painting?\nWhen did the painter live?'
# timeout-minutes: 4
10 changes: 5 additions & 5 deletions .github/workflows/linux.yml
@@ -267,12 +267,12 @@ jobs:
fail-fast: false
matrix:
test:
- name: 'Whisper'
cmd: 'tests/python_tests/test_whisper_pipeline.py tests/python_tests/test_whisper_pipeline_static.py'
- name: 'Cacheopt E2E'
cmd: 'tests/python_tests/test_kv_cache_eviction.py'
# - name: 'Whisper'
# cmd: 'tests/python_tests/test_whisper_pipeline.py tests/python_tests/test_whisper_pipeline_static.py'
# - name: 'Cacheopt E2E'
# cmd: 'tests/python_tests/test_kv_cache_eviction.py'
- name: 'LLM & VLM'
cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_pipeline.py --ignore tests/python_tests/test_kv_cache_eviction.py --ignore tests/python_tests/test_whisper_pipeline_static.py'
cmd: 'tests/python_tests --ignore tests/python_tests/test_whisper_pipeline.py --ignore tests/python_tests/test_kv_cache_eviction.py --ignore tests/python_tests/test_whisper_pipeline_static.py --ignore tests/python_tests/test_continuous_batching.py --ignore tests/python_tests/test_generation_config.py --ignore tests/python_tests/test_tokenizer.py --ignore tests/python_tests/test_vlm_pipeline.py'
defaults:
run:
shell: bash
7 changes: 4 additions & 3 deletions samples/python/text_generation/chat_sample.py
@@ -24,15 +24,16 @@ def main():
config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100

pipe.start_chat()
# pipe.start_chat()
while True:
try:
prompt = input('question:\n')
except EOFError:
break
pipe.generate(prompt, config, streamer)
res = pipe.generate(prompt, max_new_tokens=30, apply_chat_template=False)
print(res)
print('\n----------')
pipe.finish_chat()
# pipe.finish_chat()


if '__main__' == __name__:
2 changes: 1 addition & 1 deletion src/cpp/src/debug_utils.hpp
@@ -12,7 +12,7 @@
template <typename T>
void print_array(T * array, size_t size) {
std::cout << " => [ ";
for (size_t i = 0; i < std::min(size, size_t(10)); ++i) {
for (size_t i = 0; i < size; ++i) {
std::cout << array[i] << " ";
}
std::cout << " ] " << std::endl;
8 changes: 7 additions & 1 deletion src/cpp/src/icontinuous_batching.cpp
@@ -2,6 +2,8 @@
// SPDX-License-Identifier: Apache-2.0

#include "icontinuous_batching.hpp"
#include "debug_utils.hpp"
#include "openvino/genai/tokenizer.hpp"

namespace ov::genai {

@@ -64,8 +66,10 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
encoded_inputs = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false)).input_ids;
} else {
// in case when chat_template was not found in tokenizer_config.json or set
encoded_inputs = m_tokenizer.encode(prompt).input_ids;
std::string str_input(prompt);
encoded_inputs = m_tokenizer.encode(str_input, ov::genai::add_special_tokens(true)).input_ids;
}
print_tensor("encoded_inputs", encoded_inputs);
input_ids.push_back(encoded_inputs);
tokenization_durations.emplace_back(PerfMetrics::get_microsec(std::chrono::steady_clock::now() - encode_start));
}
@@ -82,6 +86,8 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
auto& raw_counters = perf_metrics.raw_metrics;
raw_counters.tokenization_durations.emplace_back(tokenization_durations[i]);

print_array(res.m_generation_ids.at(0).data(), res.m_generation_ids.at(0).size());

std::vector<std::string> generated;
generated.reserve(res.m_generation_ids.size());
for (size_t idx = 0; idx < res.m_generation_ids.size(); ++idx) {
10 changes: 8 additions & 2 deletions src/cpp/src/llm_pipeline_stateful.cpp
@@ -91,6 +91,7 @@ DecodedResults StatefulLLMPipeline::generate(
if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
std::cout << " input_vector apply_chat_template true " << std::endl;
std::vector<std::string> templated_input_vector;
for (auto& input : *input_vector) {
ChatHistory history({{{"role", "user"}, {"content", input}}});
@@ -100,8 +101,10 @@
}
encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false));
} else {
encoded_input = m_tokenizer.encode(*input_vector);
std::cout << " input_vector apply_chat_template false " << std::endl;
encoded_input = m_tokenizer.encode(*input_vector, ov::genai::add_special_tokens(true));
}
print_tensor("encoded_input", encoded_input.input_ids);
} else if (auto input_prompt = std::get_if<std::string>(&inputs)) {
std::string& prompt = *input_prompt;

@@ -172,14 +175,17 @@
} else {
std::string& prompt = *input_prompt;
if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
std::cout << " apply_chat_template true " << std::endl;
ChatHistory history({{{"role", "user"}, {"content", prompt}}});
constexpr bool add_generation_prompt = true;
auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
encoded_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false));
} else {
// in case when chat_template was not found in tokenizer_config.json or set
encoded_input = m_tokenizer.encode(prompt);
std::cout << " apply_chat_template false 1" << std::endl;
encoded_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(true));
}
print_tensor("encoded_input", encoded_input.input_ids);
}
}

2 changes: 1 addition & 1 deletion src/cpp/src/text_callback_streamer.cpp
@@ -16,7 +16,7 @@ bool TextCallbackStreamer::put(int64_t token) {
m_tokens_cache.push_back(token);
std::string text = m_tokenizer.decode(m_tokens_cache);
m_decoded_lengths.push_back(text.length());

if (!text.empty() && '\n' == text.back() && text.size() > m_printed_len) {
// Flush the cache after the new line symbol
res << std::string_view{text.data() + m_printed_len, text.size() - m_printed_len};
3 changes: 2 additions & 1 deletion src/cpp/src/visual_language/inputs_embedder.cpp
@@ -11,7 +11,7 @@

#include "utils.hpp"
#include <regex>

#include "debug_utils.hpp"
namespace ov::genai {

const ModelsMap::mapped_type& get_model_weights_pair(const ModelsMap& models_map, const std::string& key);
@@ -250,6 +250,7 @@ class InputsEmbedder::IInputsEmbedder {
}
m_tokenized_history.clear();
std::copy_n(new_chat_tokens.data<int64_t>(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history));

return encoded_input_ids;
} else {
m_tokenized_history.clear();
20 changes: 18 additions & 2 deletions tests/python_tests/common.py
@@ -253,13 +253,17 @@ def run_hugging_face(
for prompt, generation_config in zip(prompts, generation_configs):
hf_generation_config = convert_to_hf(opt_model.generation_config, generation_config)
inputs = {}
if hf_tokenizer.chat_template:
if hf_tokenizer.chat_template and generation_config.apply_chat_template:
prompt = hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
inputs = hf_tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
else:
inputs = hf_tokenizer(prompt, return_tensors="pt")
input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
prompt_len = 0 if generation_config.echo else input_ids.numel()

if (not generation_config.apply_chat_template):
print("prompt: ", prompt)
print("inputs: ", inputs)

generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
@@ -272,14 +276,19 @@ def run_hugging_face(
generation_results.append(generation_result)
else:
inputs = {}
if hf_tokenizer.chat_template:
if hf_tokenizer.chat_template and generation_configs.apply_chat_template:
processed_prompts = []
for prompt in prompts:
processed_prompts.append(hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True))
# process all prompts as a single batch as we have a single generation config for all prompts
inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=False, padding_side='left')
else:
inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, padding_side='left')

if (not generation_configs.apply_chat_template):
print("prompt: ", prompts)
print("inputs: ", inputs['input_ids'])

input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs)
hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
@@ -412,6 +421,7 @@ def run_llm_pipeline(
shutil.rmtree(models_path)

if isinstance(streamer, StreamerWithResults):
print(" ==== compare_generation_results streamer and resutls ==== ")
compare_generation_results(prompts, generation_results, streamer.get_results(), generation_config)

return generation_results
@@ -429,8 +439,10 @@ def compare_generation_result(hf_result: GenerationResult, ov_result: Generation
for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids):
assert ov_text in hf_text
else:
print("len: ", len(hf_result.m_generation_ids), " ", len(hf_result.m_generation_ids))
assert len(hf_result.m_generation_ids) == len(ov_result.m_generation_ids)
for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids):
print("hf_text: ", hf_text, " ov_text ", ov_text)
assert hf_text == ov_text


@@ -487,6 +499,10 @@ def run_llm_pipeline_with_ref(model_id: str,
ov_results = run_llm_pipeline(models_path, prompts, generation_config, use_cb, streamer=streamer.accumulate if isinstance(streamer, StreamerWithResults) else streamer)
hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_config)

if (not generation_config.apply_chat_template):
print("ov_results ", ov_results)
print("hf_results: ", hf_results)
print(" ==== compare_generation_results hf_results and ov_results ==== ")
compare_generation_results(prompts, hf_results, ov_results, generation_config)


3 changes: 3 additions & 0 deletions tests/python_tests/test_llm_pipeline.py
@@ -340,6 +340,7 @@ def test_unicode_pybind_decoding_one_string():
model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')
ov_pipe = read_model((model_id, path))[4]
res_str = ov_pipe.generate(',', max_new_tokens=4, apply_chat_template=False)
print(res_str)
assert '�' == res_str[-1]


@@ -351,7 +352,9 @@ def test_unicode_pybind_decoding_batched():
model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')
ov_pipe = read_model((model_id, path))[4]
res_str = ov_pipe.generate([","], max_new_tokens=4, apply_chat_template=False)
print(res_str.texts)
assert '�' == res_str.texts[0][-1]
assert '�' == res_str.texts[0][-2]


@pytest.mark.precommit
2 changes: 2 additions & 0 deletions tests/python_tests/test_sampling.py
@@ -30,6 +30,8 @@
])
def test_basic_stop_criteria(tmp_path, generation_config, prompt):
model_id : str = "katuni4ka/tiny-random-phi3"
if 'apply_chat_template' in generation_config:
print("apply_chat_template ", generation_config['apply_chat_template'])
run_llm_pipeline_with_ref(model_id, [prompt], generation_config, tmp_path)



This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

