
Commit

Merge branch 'master' into use-continuos-batching-by-default
ilya-lavrenov authored Oct 12, 2024
2 parents 1806fa0 + 67bcef1 commit c9dc107
Showing 52 changed files with 1,112 additions and 272 deletions.
38 changes: 24 additions & 14 deletions .github/workflows/causal_lm_cpp.yml
@@ -203,7 +203,7 @@ jobs:
echo "Multi prompt" passed
cpp-greedy_causal_lm-windows:
runs-on: windows-2019-16-core
runs-on: windows-latest
env:
PYTHONIOENCODING: "utf8"
defaults:
@@ -216,10 +216,6 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Configure Developer Command Prompt for Microsoft Visual C++
uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
with:
toolset: 14.29
- run: curl --output ov.zip ${{ env.w_ov_link }}
- run: unzip -d ov ov.zip
- run: dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}"
@@ -701,20 +697,38 @@ jobs:
run: |
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release --target visual_language_chat -j
- name: Download and convert a model and an image
cmake --build ./build/ --config Release --target visual_language_chat py_generate_pipeline -j
- name: Download and convert MiniCPM-V-2_6 model and an image
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg
- name: Run chat chat sample
- name: Run visual_language_chat sample - MiniCPM-V-2_6
run: >
source ./ov/setupvars.sh
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
<<< $'What is on the image?\nWhat is special on the image?'
- name: Download and convert LLaVa 1.5 model and an image
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/
wget https://llava-vl.github.io/static/images/monalisa.jpg
- name: Run visual_language_chat sample - LLaVa 1.5
run: >
source ./ov/setupvars.sh
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_1_5_7b_ov/ monalisa.jpg
<<< $'Who drew this painting?\nWhen did the painter live?'
- name: Run python chat sample
run: |
source ./ov/setupvars.sh
export PYTHONPATH=./build/:$PYTHONPATH
printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt
timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt
cpp-continuous-batching-ubuntu:
runs-on: ubuntu-20.04-8-cores
@@ -760,7 +774,7 @@ jobs:
timeout 200s ./build/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark -n 10 --dynamic_split_fuse --max_batch_size 256 --max_input_len 256 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1
cpp-continuous-batching-windows:
runs-on: windows-2019-16-core
runs-on: windows-latest
env:
PYTHONIOENCODING: "utf8"
defaults:
@@ -773,10 +787,6 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Configure Developer Command Prompt for Microsoft Visual C++
uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
with:
toolset: 14.29
- name: Install OpenVINO
run: |
curl --output ov.zip ${{ env.w_ov_link }}
7 changes: 1 addition & 6 deletions .github/workflows/lcm_dreamshaper_cpp.yml
@@ -70,7 +70,7 @@ jobs:
./build/samples/cpp/text2image/stable_diffusion ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting"
lcm_dreamshaper_v7_cpp-windows:
runs-on: windows-2019-16-core
runs-on: windows-latest
defaults:
run:
shell: pwsh
@@ -88,11 +88,6 @@ jobs:
mv ./tmp/*/* .
popd
- name: Configure Developer Command Prompt for Microsoft Visual C++
uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
with:
toolset: 14.29

- name: Build app
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
4 changes: 2 additions & 2 deletions .github/workflows/llm_bench-python.yml
@@ -62,11 +62,11 @@ jobs:
python ./llm_bench/python/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt
- name: Test tiny-random-baichuan2 on Linux
run: |
python ./llm_bench/python/convert.py --model_id katuni4ka/tiny-random-baichuan2 --output_dir ./ov_models/tiny-random-baichuan2 --precision FP16
optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16
python ./llm_bench/python/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1
- name: Test tiny-stable-diffusion on Linux
run: |
python ./llm_bench/python/convert.py --model_id segmind/tiny-sd --output_dir ./ov_models/tiny-sd --precision FP16
optimum-cli export openvino --model segmind/tiny-sd --trust-remote-code --weight-format fp16 ./ov_models/tiny-sd/pytorch/dldt/FP16/
python ./llm_bench/python/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./llm_bench/python/prompts/stable-diffusion.jsonl -d cpu -n 1
- name: WWB Tests
run: |
7 changes: 1 addition & 6 deletions .github/workflows/stable_diffusion_1_5_cpp.yml
@@ -76,7 +76,7 @@ jobs:
./build/samples/cpp/text2image/lora_stable_diffusion ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "curly-haired unicorn in the forest, anime, line" ./models/soulcard.safetensors 0.7
stable_diffusion_1_5_cpp-windows:
runs-on: windows-2019-16-core
runs-on: windows-latest
defaults:
run:
shell: pwsh
@@ -94,11 +94,6 @@ jobs:
mv ./tmp/*/* .
popd
- name: Configure Developer Command Prompt for Microsoft Visual C++
uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0
with:
toolset: 14.29

- name: Build app
run: |
. "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
3 changes: 1 addition & 2 deletions README.md
@@ -40,10 +40,9 @@ Continuous batching functionality is used within OpenVINO Model Server (OVMS) to

# Install optimum-intel to be able to download, convert and optimize LLMs from Hugging Face
# Optimum is not required to run models, only to convert and compress
pip install optimum[openvino]
pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git

# (Optional) Install (TBD) to be able to download models from Model Scope
#pip install optimum[openvino]
```

## Performing text generation
26 changes: 14 additions & 12 deletions llm_bench/python/benchmark.py
@@ -202,13 +202,14 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list,
log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} "
f"is different from md5 of the {num - 1} iteration {prev_md5}")
llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0])
if num == 1:
# if the device is CPU, throw exception
if args['devices'].lower().startswith('cpu') is True:
if not args.get("use_cb", False):
if num == 1:
# if the device is CPU, throw exception
if args['devices'].lower().startswith('cpu') is True:
assert (result_md5_list == prev_md5)
else:
# throw exception
assert (result_md5_list == prev_md5)
else:
# throw exception
assert (result_md5_list == prev_md5)
else:
llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0])
if bench_hook is not None:
@@ -412,13 +413,14 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} "
f"is different from md5 of the {num - 1} iteration {prev_md5}")
llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0])
if num == 1:
# if the device is CPU, throw exception
if args['devices'].lower().startswith('cpu') is True:
if not args.get("use_cb", False):
if num == 1:
# if the device is CPU, throw exception
if args['devices'].lower().startswith('cpu') is True:
assert (result_md5_list == prev_md5)
else:
# throw exception
assert (result_md5_list == prev_md5)
else:
# throw exception
assert (result_md5_list == prev_md5)
else:
llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0])
streamer.reset()
2 changes: 1 addition & 1 deletion llm_bench/python/requirements.txt
@@ -10,7 +10,7 @@ torch
transformers>=4.40.0
diffusers>=0.22.0
#optimum is in dependency list of optimum-intel
git+https://github.com/huggingface/optimum-intel.git@f34bd61df89f57f61c282c02297980299981ee78#egg=optimum-intel
git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf
packaging
psutil
2 changes: 1 addition & 1 deletion llm_bench/python/who_what_benchmark/requirements.txt
@@ -2,7 +2,7 @@ transformers>=4.35.2
sentence-transformers>=2.2.2
openvino>=2024.3.0
openvino-telemetry
optimum-intel>=1.14
optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
openvino-tokenizers
pandas>=2.0.3
numpy>=1.23.5
Binary file modified samples/cpp/text2image/512x512.bmp
Binary file modified samples/cpp/text2image/baseline.bmp
Binary file modified samples/cpp/text2image/lora.bmp
@@ -21,15 +21,20 @@ int main(int argc, char* argv[]) try {
// 'task' and 'language' parameters are supported for multilingual models only
config.language = "<|en|>";
config.task = "transcribe";
config.return_timestamps = true;

auto streamer = [](std::string word) {
std::cout << word;
return false;
};

pipeline.generate(raw_speech, config, streamer);
auto result = pipeline.generate(raw_speech, config, streamer);

std::cout << std::endl;
std::cout << "\n";

for (auto& chunk : *result.chunks) {
std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
}
} catch (const std::exception& error) {
try {
std::cerr << error.what() << '\n';
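The loop added above dereferences result.chunks unconditionally. As a minimal sketch (not part of this commit), the same printout can be guarded, assuming chunks is optional-like (it is dereferenced with '*' above) and is only populated when config.return_timestamps is set:

// Guarded variant of the chunk printout shown in the diff above (sketch only).
if (result.chunks) {
    for (const auto& chunk : *result.chunks) {
        std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts
                  << "] text: " << chunk.text << "\n";
    }
}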
Binary file modified samples/generation.gif
@@ -16,7 +16,7 @@ pip install --upgrade-strategy eager -r ../../requirements.txt
## Run:
[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image.

`vlm_chat_sample.py ./miniCPM-V-2_6/ 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg`
`visual_language_chat.py ./miniCPM-V-2_6/ 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg`


Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. # TODO: examples of larger models
@@ -54,12 +54,16 @@ def main():
config.max_new_tokens = 100

pipe.start_chat()
prompt = input('question:\n')
pipe(prompt, image=image, generation_config=config, streamer=streamer)
print('\n----------')

while True:
try:
prompt = input('question:\n')
except EOFError:
break
pipe(prompt, image=image, generation_config=config, streamer=streamer)
pipe(prompt, generation_config=config, streamer=streamer)
print('\n----------')
pipe.finish_chat()

@@ -26,15 +26,19 @@ def streamer(word: str) -> bool:
print(word, end="")
return False

pipe.generate(
result = pipe.generate(
raw_speech,
max_new_tokens=100,
# 'task' and 'language' parameters are supported for multilingual models only
language="<|en|>",
task="transcribe",
return_timestamps=True,
streamer=streamer,
)

for chunk in result.chunks:
print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}")

print()


3 changes: 2 additions & 1 deletion samples/requirements.txt
@@ -1,5 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
optimum[openvino]==1.22.0
optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
numpy<2.0.0; sys_platform == 'darwin'
einops==0.8.0 # For Qwen
transformers_stream_generator==0.0.5 # For Qwen
diffusers==0.30.3
4 changes: 2 additions & 2 deletions src/cpp/CMakeLists.txt
@@ -15,8 +15,8 @@ endif()

function(ov_genai_build_jinja2cpp)
FetchContent_Declare(jinja2cpp
URL https://github.com/jinja2cpp/Jinja2Cpp/archive/b32fbde7d98d13c34784c332c4a24a6f92c76e38.tar.gz
URL_HASH SHA256=7cc25ddbc438a5c874d404e100b4eccd8a331c195417f5487c48aebcf4b9e7fb)
URL https://github.com/ilya-lavrenov/Jinja2Cpp/archive/04073b62ec950eab6cdcb2c563c1c9bb7698f1ea.tar.gz
URL_HASH SHA256=9f2a346eec91a6a381fe8fd631e9c952fe7087882bbca7f0e4e42e75e680fc1b)

FetchContent_GetProperties(jinja2cpp)
if(NOT jinja2cpp_POPULATED)
8 changes: 8 additions & 0 deletions src/cpp/include/openvino/genai/processor_config.hpp
@@ -34,6 +34,14 @@ class OPENVINO_GENAI_EXPORTS ProcessorConfig {
/// Applied after norm_mean.
/// llava calls it image_std.
std::array<float, 3> norm_std{1.0f, 1.0f, 1.0f};

// llava specific config params
std::array<float, 3> image_mean{0.0f, 0.0f, 0.0f};
std::array<float, 3> image_std{1.0f, 1.0f, 1.0f};
size_t crop_size_height = 336;
size_t crop_size_width = 336;
size_t size_shortest_edge = 336;

/// @brief Default constructor
ProcessorConfig() = default;
/// @brief Construct ProcessorConfig from values in json_path.
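The new llava-specific members appear to mirror the keys of a Hugging Face llava preprocessor_config.json (image_mean, image_std, crop_size, size.shortest_edge). Below is a minimal sketch of setting them from code, assuming the members are publicly assignable like the defaults declared above; the CLIP-style normalization numbers are illustrative assumptions, not values taken from this commit.

#include "openvino/genai/processor_config.hpp"

int main() {
    ov::genai::ProcessorConfig config;                            // defaults as declared above
    config.image_mean = {0.48145466f, 0.4578275f, 0.40821073f};  // assumed CLIP-style means
    config.image_std  = {0.26862954f, 0.26130258f, 0.27577711f}; // assumed CLIP-style stds
    config.crop_size_height = 336;
    config.crop_size_width  = 336;
    config.size_shortest_edge = 336;
    return 0;
}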
17 changes: 15 additions & 2 deletions src/cpp/include/openvino/genai/vision_encoder.hpp
@@ -5,6 +5,7 @@

#include "openvino/genai/processor_config.hpp"
#include <openvino/openvino.hpp>
#include "vlm_model_type.hpp"

namespace ov::genai {
/// @brief A pair describing image size.
@@ -41,8 +42,10 @@ struct EncodedImage {
/// ov::InferRequest and configured by ProcessorConfig.
class OPENVINO_GENAI_EXPORTS VisionEncoder {
public:
/// @brief A enum denoting model type.
VLMModelType model_type;
/// @brief A model for image encoding.
ov::InferRequest m_encoder;
ov::InferRequest m_vision_encoder;
/// @brief A config to follow.
ProcessorConfig m_processor_config;

@@ -52,7 +55,7 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
explicit VisionEncoder(
const ov::InferRequest& encoder,
const ProcessorConfig& processor_config=ProcessorConfig{}
) : m_encoder{encoder}, m_processor_config{processor_config} {}
) : m_vision_encoder{encoder}, m_processor_config{processor_config} {}

/// @brief Construct the encoder from model_dir.
/// @param model_dir A folder containing openvino_embedding.xml and
@@ -63,6 +66,7 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
/// @param core ov::Core to be used to compile the model.
explicit VisionEncoder(
const std::filesystem::path& model_dir,
const VLMModelType model_type,
const std::string& device="CPU",
const ov::AnyMap device_config={},
ov::Core core=ov::Core{}
@@ -117,5 +121,14 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
image, AnyMap{std::forward<Properties>(properties)...}
);
}

private:
EncodedImage encode_minicpm(
const ov::Tensor& image, const ProcessorConfig& config
);

EncodedImage encode_llava(
const ov::Tensor& image, const ProcessorConfig& config
);
};
}
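For context, a rough usage sketch of the updated constructor, based only on the declarations above. The VLMModelType enumerator name, the plain encode(image) overload, and the input tensor layout are assumptions rather than code from this commit.

#include <filesystem>
#include "openvino/genai/vision_encoder.hpp"

int main() {
    // model_dir, model_type, device: the signature declared above.
    ov::genai::VisionEncoder encoder{
        std::filesystem::path{"./miniCPM-V-2_6/"},
        ov::genai::VLMModelType::MINICPM,  // assumed enumerator from vlm_model_type.hpp
        "CPU"
    };
    // A dummy u8 image tensor; the expected layout is an assumption here.
    ov::Tensor image(ov::element::u8, ov::Shape{1, 448, 448, 3});
    ov::genai::EncodedImage encoded = encoder.encode(image);
    (void)encoded;  // image features consumed downstream by the VLM pipeline
    return 0;
}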
3 changes: 3 additions & 0 deletions src/cpp/include/openvino/genai/vlm_config.hpp
@@ -6,12 +6,15 @@
#include "openvino/genai/visibility.hpp"
#include <openvino/runtime/properties.hpp>
#include <filesystem>
#include "vlm_model_type.hpp"

namespace ov::genai {
/// @brief A Configuration class passed to VLMPipeline and used to
/// change VLMPipeline's behavior. Corresponds to config.json.
class OPENVINO_GENAI_EXPORTS VLMConfig {
public:
/// @brief A enum denoting model type.
VLMModelType model_type;
/// @brief A size of a single embedding returned by a resampler.
/// Used to initialize positional embeddings for resampler input.
size_t hidden_size = 2304;
Expand Down
