Replace export_MiniCPM-V-2_6.py #957

Closed
33 commits
6151231 Hide VLM files and API (Wovchena, Oct 12, 2024)
7d94e1a Remove unused concatenate_mid_dim (Wovchena, Oct 12, 2024)
eeb818d Initialize m_image_id in constructor similar to the reset of the fields (Wovchena, Oct 12, 2024)
20a6954 Retrigger (Wovchena, Oct 12, 2024)
0737db2 Move to visual_language (Wovchena, Oct 14, 2024)
0bddfba Correct py_vlm_pipeline.cpp include (Wovchena, Oct 14, 2024)
1b2da2d fix (Wovchena, Oct 14, 2024)
7f0ef7a Move vision_encoder, pipeline.hpp (Wovchena, Oct 14, 2024)
457024c Replace export_MiniCPM-V-2_6.py (Wovchena, Oct 14, 2024)
d11f18d Downgrade optimum (Wovchena, Oct 14, 2024)
a82fe79 Everywhere python -m pip install -U optimum<1.23 --no-dependencies (Wovchena, Oct 14, 2024)
6d37b64 Remove duplicates (Wovchena, Oct 14, 2024)
b8fd628 Fix dtype (Wovchena, Oct 14, 2024)
b5bad1f Merge branch 'master' into replace-export_MiniCPM-V-2_6.py (Wovchena, Oct 15, 2024)
7bdce55 fix merge (Wovchena, Oct 15, 2024)
ff4f4be delete src/cpp/src/visual_language/vlm_pipeline.cpp (Wovchena, Oct 15, 2024)
4112edf fix conversion in test (Wovchena, Oct 15, 2024)
c4573b8 dont print in test (Wovchena, Oct 15, 2024)
8c67805 skip (Wovchena, Oct 15, 2024)
24015da cleanup (Wovchena, Oct 15, 2024)
8410b22 Put torchvision back (Wovchena, Oct 15, 2024)
1fea50f update tests requirements (Wovchena, Oct 15, 2024)
65644db Merge branch 'master' into replace-export_MiniCPM-V-2_6.py (Wovchena, Oct 15, 2024)
d1448ef remove wwb req (Wovchena, Oct 15, 2024)
67e60ac wwb reqs (Wovchena, Oct 15, 2024)
f67ce00 req (Wovchena, Oct 15, 2024)
e2ac30e int8 (Wovchena, Oct 15, 2024)
e084e79 xfail (Wovchena, Oct 15, 2024)
d868e0f Merge branch 'master' into replace-export_MiniCPM-V-2_6.py (Wovchena, Oct 16, 2024)
509fb2f Move common model parts (Wovchena, Oct 16, 2024)
7513752 Merge branch 'master' into replace-export_MiniCPM-V-2_6.py (Wovchena, Oct 16, 2024)
3bf2381 Merge branch 'master' into replace-export_MiniCPM-V-2_6.py (Wovchena, Oct 16, 2024)
db8fdc9 Increase timeout (Wovchena, Oct 16, 2024)
7 changes: 4 additions & 3 deletions .github/workflows/causal_lm_cpp.yml
@@ -703,12 +703,13 @@ jobs:
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
python -m pip install -U "optimum<1.23" --no-dependencies
Contributor:
Should it be moved to samples/requirements.txt?

Collaborator Author:
It's a conflicting requirement. The only option left is to add it to README.md, but I want to see whether the issue fixes itself before the release.

optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg
- name: Run visual_language_chat sample - MiniCPM-V-2_6
run: >
source ./ov/setupvars.sh
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./MiniCPM-V-2_6/ cat.jpg
<<< $'What is on the image?\nWhat is special on the image?'
- name: Download and convert LLaVa 1.5 model and an image
run: |
@@ -728,7 +729,7 @@ jobs:
source ./ov/setupvars.sh
export PYTHONPATH=./build/:$PYTHONPATH
printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt
timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt
timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./MiniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt

cpp-continuous-batching-ubuntu:
runs-on: ubuntu-20.04-8-cores
2 changes: 1 addition & 1 deletion samples/cpp/visual_language_chat/README.md
@@ -10,7 +10,7 @@ It's not required to install [../../requirements.txt](../../requirements.txt) fo

```sh
pip install --upgrade-strategy eager -r ../../requirements.txt
export_MiniCPM-V-2_6.py miniCPM-V-2_6
optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code
```

## Run
12 changes: 6 additions & 6 deletions samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py
@@ -23,10 +23,10 @@
from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher
import time

text_emb_path = Path("embed_tokens.xml")
image_emb_path = Path("image_encoder.xml")
resampler_path = Path("resampler.xml")
llm_path = Path("language_model.xml")
text_emb_path = Path("openvino_text_embeddings_model.xml")
image_emb_path = Path("openvino_vision_embeddings_model.xml")
resampler_path = Path("openvino_resampler_model.xml")
llm_path = Path("openvino_language_model.xml")

class InsertSlice(MatcherPass):
def __init__(self):
@@ -596,8 +596,8 @@ def __init__(self, model_dir, device="CPU", ov_config=None, compile=True, slice_
self.config.is_encoder_decoder = False
self.generation_config = GenerationConfig.from_model_config(self.config)
model_dir = Path(model_dir)
self.model = core.read_model(model_dir / "language_model.xml")
self.token_emb = core.read_model(model_dir / "embed_tokens.xml")
self.model = core.read_model(model_dir / "openvino_language_model.xml")
self.token_emb = core.read_model(model_dir / "openvino_text_embeddings_model.xml")
if slice_lm_head:
self.slice_lm_head()
self.request = None
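The script's IR file names are updated to match what optimum-intel now emits. For reference, this is roughly how code outside the script would locate the renamed parts; a minimal sketch only, where the device choice and the decision to read all four models are illustrative assumptions, not part of this PR:

```cpp
#include <openvino/openvino.hpp>
#include <filesystem>

int main() {
    ov::Core core;
    // Folder produced by `optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code`.
    std::filesystem::path model_dir{"MiniCPM-V-2_6"};

    // The exported parts now follow optimum-intel's naming convention.
    auto language_model    = core.read_model((model_dir / "openvino_language_model.xml").string());
    auto text_embeddings   = core.read_model((model_dir / "openvino_text_embeddings_model.xml").string());
    auto vision_embeddings = core.read_model((model_dir / "openvino_vision_embeddings_model.xml").string());
    auto resampler         = core.read_model((model_dir / "openvino_resampler_model.xml").string());

    // Compile whichever part is needed, e.g. the language model on CPU.
    ov::InferRequest llm = core.compile_model(language_model, "CPU").create_infer_request();
}
```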
2 changes: 1 addition & 1 deletion samples/cpp/visual_language_chat/visual_language_chat.cpp
@@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0

#include "load_image.hpp"
#include <openvino/genai/vlm_pipeline.hpp>
#include <openvino/genai/visual_language/pipeline.hpp>
#include <openvino/runtime/intel_gpu/properties.hpp>

bool print_subword(std::string&& subword) {
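With the header moved, minimal use of the pipeline from the sample looks roughly like the sketch below. Only the include path and the print_subword streamer come from the diff; the constructor signature, the generate() overload, and the ov::genai::image / ov::genai::streamer property helpers are assumptions based on the existing samples, and the image tensor is a placeholder (the real sample fills it via its load_image.hpp helper):

```cpp
#include <openvino/genai/visual_language/pipeline.hpp>  // new header location
#include <iostream>

// Streamer callback: returning false tells the pipeline to keep generating.
bool print_subword(std::string&& subword) {
    std::cout << subword << std::flush;
    return false;
}

int main(int argc, char* argv[]) {
    // argv[1] is the folder produced by `optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 ...`.
    ov::genai::VLMPipeline pipe(argv[1], "CPU");

    // Placeholder image tensor; shape and element type are illustrative only.
    ov::Tensor image(ov::element::u8, {1, 448, 448, 3});

    pipe.generate("What is on the image?",
                  ov::genai::image(image),              // assumed property helper
                  ov::genai::streamer(print_subword));  // assumed property helper
}
```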
2 changes: 1 addition & 1 deletion samples/python/visual_language_chat/README.md
@@ -10,8 +10,8 @@ It's not required to install [../../requirements.txt](../../requirements.txt) fo

```sh
pip install --upgrade-strategy eager -r ../../requirements.txt
optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 miniCPM-V-2_6 --trust-remote-code
```
# TODO: add optimum cli command for miniCPM-V-2_6 when available

## Run:
[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image.
2 changes: 1 addition & 1 deletion samples/requirements.txt
@@ -1,5 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cpu
optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
optimum-intel @ git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
numpy<2.0.0; sys_platform == 'darwin'
einops==0.8.0 # For Qwen
transformers_stream_generator==0.0.5 # For Qwen
@@ -6,54 +6,13 @@
#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "openvino/genai/vision_encoder.hpp"
#include "openvino/genai/vlm_config.hpp"
#include <filesystem>

namespace ov::genai {
/// @brief A string prompt and source image.
struct PromptImages {
/// @brief A prompt represented as std::string.
std::string prompt;
/// @brief An image represented as ov::Tensor.
std::vector<ov::Tensor> images;
};

/// @brief A Visual language modeling pipeline class used to generate a
/// response or run a chat given a prompt and an image.
class OPENVINO_GENAI_EXPORTS VLMPipeline {
public:
// A config to follow for LLM input construction.
VLMConfig m_vlm_config;
// A config to follow for text generation.
GenerationConfig m_generation_config;
// A tokenizer encoding a prompt.
Tokenizer m_tokenizer;
// An encoder to infer embeddings of an image.
VisionEncoder m_vision_encoder;
// A resampler model to resample image embeddings.
// [N, H*W, old_hidden_size] is the input shape.
// [N, query_num, hidden_size] is the output shape.
ov::InferRequest m_resampler;
// A model to compute token embeddings.
// Input shape: [N, conversation length].
// Output shape: [1, conversation length, hidden_size].
ov::InferRequest m_embedding;
// A language model used to generate a response.
// Input shapes: inputs_embeds[N, conversation length, hidden_size],
// position_ids[N, conversation length], beam_idx[N].
// Output shape: logits[N, conversation length, vocab_size].
ov::InferRequest m_language;
// Precomputed positional embeddings for the resampler.
// [70, 70, hidden_size]. 70 is the initial guess of the image
// height and width after dividing by patch_size.
ov::Tensor m_pos_embed_cache;
// True if chat mode is activated to save conversation
// history between generate() calls.
bool m_is_chat_conversation;
ChatHistory m_history;
std::string m_templated_chat_history;
size_t image_id = 0; // Used to insert <image_id>i</image_id> per image (not a slice).

/// @brief Construct a pipeline form a folder containing tokenizer
/// and model IRs.
/// @param model_dir A folder to read tokenizer and model IRs.
@@ -122,7 +81,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
/// role.
void start_chat(const std::string& system_message="");
/// @brief Deactivate chat mode.
void finish_chat() {m_is_chat_conversation = false;}
void finish_chat();
/// @brief Set a custom chat template. Can be used to deactivate
/// chat_template application for chat mode if called with
/// "{% for message in messages %}{{ message['content'] }}{% endfor %}"
@@ -139,9 +98,6 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
private:
class VLMPipelineImpl;
std::unique_ptr<VLMPipelineImpl> m_pimpl;

ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images);
ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images);
};

/*
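This hunk trims the public surface of VLMPipeline: the infer requests, caches, and chat-state members leave the header, and finish_chat() loses its inline body, so that state can live behind the already-present m_pimpl. Below is a generic sketch of that pimpl arrangement; only finish_chat and the unique_ptr member mirror the diff, the rest of the names are illustrative:

```cpp
#include <memory>

// --- header (sketch) ---
class Pipeline {
public:
    Pipeline();
    ~Pipeline();              // defined where Impl is a complete type
    void finish_chat();       // declared only, like in this PR
private:
    class Impl;
    std::unique_ptr<Impl> m_pimpl;
};

// --- source file (sketch) ---
class Pipeline::Impl {
public:
    bool is_chat_conversation = false;   // moved out of the public header
};

Pipeline::Pipeline() : m_pimpl(std::make_unique<Impl>()) {}
Pipeline::~Pipeline() = default;
void Pipeline::finish_chat() { m_pimpl->is_chat_conversation = false; }
```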
2 changes: 1 addition & 1 deletion src/cpp/src/utils.hpp
@@ -6,7 +6,7 @@
#include <nlohmann/json.hpp>

#include "openvino/genai/llm_pipeline.hpp"
#include "openvino/genai/processor_config.hpp"
#include "visual_language/processor_config.hpp"

namespace ov {
namespace genai {
File renamed without changes.
File renamed without changes.
@@ -1,7 +1,7 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/processor_config.hpp"
#include "processor_config.hpp"
#include "utils.hpp"
#include <fstream>

@@ -1,8 +1,8 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <openvino/genai/vision_encoder.hpp>
#include "clip.hpp"
#include "vision_encoder.hpp"
#include "visual_language/clip.hpp"
#include "utils.hpp"

using namespace ov::genai;
@@ -300,8 +300,8 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
ov::Tensor input_tensor{ov::element::f32, {1, 3, size_t(resized_preprocessed.ny), size_t(resized_preprocessed.nx)}, (void*)(resized_preprocessed.buf.data())};
ov::Tensor pixel_values = preprocess_for_encoder(input_tensor, patch_size);
encoder.set_tensor("pixel_values", pixel_values);
ov::Tensor patch_attention_mask{ov::element::boolean, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}};
std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}};
std::fill_n(patch_attention_mask.data<float>(), patch_attention_mask.get_size(), 1.0f);
encoder.set_tensor("patch_attention_mask", patch_attention_mask);
ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {resized_source_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
encoder.set_tensor("position_ids", position_ids);
@@ -432,7 +432,7 @@ ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig
VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) :
model_type(model_type) {
if (model_type == VLMModelType::MINICPM) {
m_vision_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request();
m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
} else if (model_type == VLMModelType::LLAVA) {
// Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel
m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
@yatarkan (Contributor), Oct 15, 2024:
This if statement can also be omitted after switching to optimum-cli export for minicpm (as in #957 (comment) )

Collaborator Author:
Done

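The hunk above changes patch_attention_mask from a boolean tensor of true values to an f32 tensor of ones (likely the "Fix dtype" commit), presumably because the exported vision encoder expects a floating-point mask input. A standalone sketch of the new construction; the patch-grid size here is an illustrative assumption:

```cpp
#include <openvino/openvino.hpp>
#include <algorithm>

int main() {
    size_t batch = 1, height = 14, width = 14;  // illustrative patch-grid size

    // Before: ov::element::boolean filled with true.
    // After: an f32 tensor of the same shape filled with 1.0f.
    ov::Tensor patch_attention_mask{ov::element::f32, {batch, 1, height * width}};
    std::fill_n(patch_attention_mask.data<float>(), patch_attention_mask.get_size(), 1.0f);

    // The mask is then fed to the vision encoder, as in the diff:
    // encoder.set_tensor("patch_attention_mask", patch_attention_mask);
}
```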
@@ -3,9 +3,9 @@

#pragma once

#include "openvino/genai/processor_config.hpp"
#include <openvino/openvino.hpp>
#include "vlm_model_type.hpp"
#include "visual_language/processor_config.hpp"
#include "visual_language/vlm_model_type.hpp"

namespace ov::genai {
/// @brief A pair describing image size.
@@ -1,7 +1,7 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/vlm_config.hpp"
#include "vlm_config.hpp"
#include "utils.hpp"
#include <fstream>

@@ -4,9 +4,9 @@
#pragma once

#include "openvino/genai/visibility.hpp"
#include "visual_language/vlm_model_type.hpp"
#include <openvino/runtime/properties.hpp>
#include <filesystem>
#include "vlm_model_type.hpp"

namespace ov::genai {
/// @brief A Configuration class passed to VLMPipeline and used to