Enable LLaVa-1.5 in VLM Pipeline #917

Merged 20 commits on Oct 11, 2024
17 changes: 14 additions & 3 deletions .github/workflows/causal_lm_cpp.yml
@@ -698,19 +698,30 @@ jobs:
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release --target visual_language_chat py_generate_pipeline -j
- name: Download and convert a model and an image
- name: Download and convert MiniCPM-V-2_6 model and an image
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg

- name: Run chat chat sample
- name: Run visual_language_chat sample - MiniCPM-V-2_6
run: >
source ./ov/setupvars.sh
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
<<< $'What is on the image?\nWhat is special on the image?'
- name: Download and convert LLaVa 1.5 model and an image
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/
wget https://llava-vl.github.io/static/images/monalisa.jpg
- name: Run visual_language_chat sample - LLaVa 1.5
run: >
source ./ov/setupvars.sh
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_1_5_7b_ov/ monalisa.jpg
<<< $'Who drew this painting?\nWhen did the painter live?'

- name: Run python chat sample
run: |
3 changes: 1 addition & 2 deletions README.md
@@ -40,10 +40,9 @@ Continuous batching functionality is used within OpenVINO Model Server (OVMS) to

# Install optimum-intel to be able to download, convert and optimize LLMs from Hugging Face
# Optimum is not required to run models, only to convert and compress
pip install optimum[openvino]
pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git

# (Optional) Install (TBD) to be able to download models from Model Scope
#pip install optimum[openvino]
```

## Performing text generation
3 changes: 2 additions & 1 deletion samples/requirements.txt
@@ -1,5 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
optimum[openvino]==1.22.0
optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
numpy<2.0.0; sys_platform == 'darwin'
einops==0.8.0 # For Qwen
transformers_stream_generator==0.0.5 # For Qwen
diffusers==0.30.3
8 changes: 8 additions & 0 deletions src/cpp/include/openvino/genai/processor_config.hpp
@@ -34,6 +34,14 @@ class OPENVINO_GENAI_EXPORTS ProcessorConfig {
/// Applied after norm_mean.
/// llava calls it image_std.
std::array<float, 3> norm_std{1.0f, 1.0f, 1.0f};

// llava specific config params
std::array<float, 3> image_mean{0.0f, 0.0f, 0.0f};
std::array<float, 3> image_std{1.0f, 1.0f, 1.0f};
size_t crop_size_height = 336;
size_t crop_size_width = 336;
size_t size_shortest_edge = 336;

/// @brief Default constructor
ProcessorConfig() = default;
/// @brief Construct ProcessorConfig from values in json_path.
17 changes: 15 additions & 2 deletions src/cpp/include/openvino/genai/vision_encoder.hpp
@@ -5,6 +5,7 @@

#include "openvino/genai/processor_config.hpp"
#include <openvino/openvino.hpp>
#include "vlm_model_type.hpp"

namespace ov::genai {
/// @brief A pair describing image size.
@@ -41,8 +41,10 @@ struct EncodedImage {
/// ov::InferRequest and configured by ProcessorConfig.
class OPENVINO_GENAI_EXPORTS VisionEncoder {
public:
/// @brief An enum denoting model type.
VLMModelType model_type;
/// @brief A model for image encoding.
ov::InferRequest m_encoder;
ov::InferRequest m_vision_encoder;
/// @brief A config to follow.
ProcessorConfig m_processor_config;

@@ -52,7 +55,7 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
explicit VisionEncoder(
const ov::InferRequest& encoder,
const ProcessorConfig& processor_config=ProcessorConfig{}
) : m_encoder{encoder}, m_processor_config{processor_config} {}
) : m_vision_encoder{encoder}, m_processor_config{processor_config} {}

/// @brief Construct the encoder from model_dir.
/// @param model_dir A folder containing openvino_embedding.xml and
@@ -63,6 +66,7 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
/// @param core ov::Core to be used to compile the model.
explicit VisionEncoder(
const std::filesystem::path& model_dir,
const VLMModelType model_type,
const std::string& device="CPU",
const ov::AnyMap device_config={},
ov::Core core=ov::Core{}
@@ -117,5 +121,14 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
image, AnyMap{std::forward<Properties>(properties)...}
);
}

private:
EncodedImage encode_minicpm(
const ov::Tensor& image, const ProcessorConfig& config
);

EncodedImage encode_llava(
const ov::Tensor& image, const ProcessorConfig& config
);
};
}
3 changes: 3 additions & 0 deletions src/cpp/include/openvino/genai/vlm_config.hpp
@@ -6,12 +6,15 @@
#include "openvino/genai/visibility.hpp"
#include <openvino/runtime/properties.hpp>
#include <filesystem>
#include "vlm_model_type.hpp"

namespace ov::genai {
/// @brief A Configuration class passed to VLMPipeline and used to
/// change VLMPipeline's behavior. Corresponds to config.json.
class OPENVINO_GENAI_EXPORTS VLMConfig {
public:
/// @brief An enum denoting model type.
VLMModelType model_type;
/// @brief A size of a single embedding returned by a resampler.
/// Used to initialize positional embeddings for resampler input.
size_t hidden_size = 2304;
31 changes: 31 additions & 0 deletions src/cpp/include/openvino/genai/vlm_model_type.hpp
@@ -0,0 +1,31 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <string>
#include <unordered_map>

#include "openvino/genai/visibility.hpp"
#include <openvino/core/except.hpp>

namespace ov::genai {

enum class OPENVINO_GENAI_EXPORTS VLMModelType {
MINICPM,
LLAVA,
};

inline VLMModelType to_vlm_model_type(const std::string& value) {
static const std::unordered_map<std::string, VLMModelType> model_types_map = {
{"minicpmv", VLMModelType::MINICPM},
{"llava", VLMModelType::LLAVA}
};

auto it = model_types_map.find(value);
if (it != model_types_map.end()) {
return it->second;
}
OPENVINO_THROW("Unsupported '", value, "' VLM model type");
}
}
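
For orientation, a minimal usage sketch of the mapping above (not part of this diff). It assumes the new header is on the include path and that "llava" / "minicpmv" are the `model_type` strings found in config.json of the respective exports, as the map suggests.

```cpp
// Minimal, illustrative use of to_vlm_model_type; assumes the header above is available.
#include <iostream>
#include "openvino/genai/vlm_model_type.hpp"

int main() {
    // "llava" is the model_type string written to config.json for LLaVa-1.5 exports.
    ov::genai::VLMModelType type = ov::genai::to_vlm_model_type("llava");
    if (type == ov::genai::VLMModelType::LLAVA) {
        std::cout << "LLaVa branch selected\n";
    }
    // Any other string, e.g. "qwen2_vl", would hit OPENVINO_THROW above.
    return 0;
}
```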
3 changes: 3 additions & 0 deletions src/cpp/include/openvino/genai/vlm_pipeline.hpp
@@ -139,6 +139,9 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
private:
class VLMPipelineImpl;
std::unique_ptr<VLMPipelineImpl> m_pimpl;

ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images);
ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images);
};

/*
18 changes: 17 additions & 1 deletion src/cpp/src/processor_config.cpp
@@ -10,7 +10,7 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa
OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config");
nlohmann::json parsed = nlohmann::json::parse(stream);
using ov::genai::utils::read_json_param;
read_json_param(parsed, "patch_size", patch_size);
read_json_param(parsed, "patch_size", patch_size); // For llava - stored in config.json vision_config
read_json_param(parsed, "scale_resolution", scale_resolution);
read_json_param(parsed, "max_slice_nums", max_slice_nums);
if (parsed.contains("norm_mean")) {
@@ -19,4 +19,20 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa
if (parsed.contains("norm_std")) {
norm_std = parsed.at("norm_std").get<std::array<float, 3>>();
}

// Setting llava config params
if (parsed.contains("image_mean")) {
image_mean = parsed.at("image_mean").get<std::array<float, 3>>();
}
if (parsed.contains("image_std")) {
image_std = parsed.at("image_std").get<std::array<float, 3>>();
}

if (parsed.contains("crop_size")) {
crop_size_height = parsed.at("crop_size").at("height");
crop_size_width = parsed.at("crop_size").at("width");
}
if (parsed.contains("size")) {
size_shortest_edge = parsed.at("size").at("shortest_edge");
}
}
114 changes: 101 additions & 13 deletions src/cpp/src/vision_encoder.cpp
@@ -362,29 +362,117 @@ ProcessorConfig from_any_map(
read_anymap_param(config_map, "norm_std", extracted_config.norm_std);
return extracted_config;
}


ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig& config) {
bool do_resize = true;
bool do_center_crop = true;

// ov::Tensor to clip_image_u8
clip_image_u8 input_image{
int(image.get_shape().at(3)),
int(image.get_shape().at(2)),
{image.data<uint8_t>(), image.data<uint8_t>() + image.get_size()}
};

// Resize
clip_image_u8 resized_image;
if (do_resize) {
int target_size = config.size_shortest_edge;
float scale = static_cast<float>(target_size) / std::min(input_image.nx, input_image.ny);
int new_width = static_cast<int>(input_image.nx * scale);
int new_height = static_cast<int>(input_image.ny * scale);
bicubic_resize(input_image, resized_image, new_width, new_height);
} else {
resized_image = input_image;
}

// Center crop
clip_image_u8 cropped_image;
if (do_center_crop) {
int crop_height = config.crop_size_height;
int crop_width = config.crop_size_width;
int start_x = (resized_image.nx - crop_width) / 2;
int start_y = (resized_image.ny - crop_height) / 2;

cropped_image.nx = crop_width;
cropped_image.ny = crop_height;
cropped_image.buf.resize(3 * crop_width * crop_height);

for (int y = 0; y < crop_height; ++y) {
for (int x = 0; x < crop_width; ++x) {
for (int c = 0; c < 3; ++c) {
cropped_image.buf[(y * crop_width + x) * 3 + c] =
resized_image.buf[((start_y + y) * resized_image.nx + (start_x + x)) * 3 + c];
}
}
}
} else {
cropped_image = resized_image;
}

// Normalize
clip_ctx ctx;
std::copy(config.image_mean.begin(), config.image_mean.end(), ctx.image_mean);
std::copy(config.image_std.begin(), config.image_std.end(), ctx.image_std);

clip_image_f32 normalized_image = clip_image_preprocess(ctx, cropped_image);

// Convert clip_image_f32 to ov::Tensor; copy the data so the tensor owns it,
// since normalized_image.buf is a local buffer destroyed when this function returns
ov::Tensor result(
ov::element::f32,
{1, 3, size_t(normalized_image.ny), size_t(normalized_image.nx)}
);
std::copy(normalized_image.buf.begin(), normalized_image.buf.end(), result.data<float>());

return result;
}
}

VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap device_config, ov::Core core) :
VisionEncoder{
core.compile_model(
model_dir / "image_encoder.xml", device, device_config
).create_infer_request(),
ov::genai::utils::from_config_json_if_exists<ov::genai::ProcessorConfig>(
VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) :
model_type(model_type) {
if (model_type == VLMModelType::MINICPM) {
m_vision_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request();
} else if (model_type == VLMModelType::LLAVA) {
// Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel
m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
}
m_processor_config = ov::genai::utils::from_config_json_if_exists<ov::genai::ProcessorConfig>(
model_dir, "preprocessor_config.json"
)
} {}
);
}

EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfig& config) {
if (model_type == VLMModelType::MINICPM) {
return encode_minicpm(image, config);
} else if (model_type == VLMModelType::LLAVA) {
return encode_llava(image, config);
}
OPENVINO_THROW("Unsupported model type");
}

EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
return encode(image, from_any_map(
config_map, m_processor_config
));
}

EncodedImage VisionEncoder::encode_minicpm(const ov::Tensor& image, const ProcessorConfig& config) {
clip_ctx ctx_clip;
ctx_clip.patch_size = m_processor_config.patch_size;
ctx_clip.image_size = m_processor_config.image_size;
std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean);
std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std);
return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums);
return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_vision_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums);
}

EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
return encode(image, from_any_map(
config_map, m_processor_config
));
EncodedImage VisionEncoder::encode_llava(const ov::Tensor& image, const ProcessorConfig& config) {
ov::Tensor preprocessed_image = preprocess_image_llava(image, config);

m_vision_encoder.set_tensor("pixel_values", preprocessed_image);
m_vision_encoder.infer();

ov::Tensor image_features = m_vision_encoder.get_output_tensor();
ImageSize resized_source_size{config.crop_size_height / config.patch_size, config.crop_size_width / config.patch_size};

return {image_features, resized_source_size};
}
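
As a rough, hedged sketch (not part of this diff) of how the new LLaVa path might be driven end to end: construct the encoder from an optimum-cli export directory and encode an image. The model directory mirrors the CI step above; the {1, 3, height, width} u8 layout is an assumption inferred from how preprocess_image_llava reads width and height, and the zero-filled tensor stands in for a real decoded image.

```cpp
// Hedged usage sketch of the LLaVa vision-encoding path; assumes a model exported with
//   optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/
#include "openvino/genai/vision_encoder.hpp"

int main() {
    using namespace ov::genai;

    // Compiles openvino_vision_embeddings_model.xml (vision tower merged with the
    // multimodal projector) and reads preprocessor_config.json from the export dir.
    VisionEncoder encoder("./llava_1_5_7b_ov/", VLMModelType::LLAVA, "CPU");

    // Placeholder image: {1, 3, height, width}, u8 (layout assumed from preprocess_image_llava).
    ov::Tensor image(ov::element::u8, {1, 3, 480, 640});

    // Shortest-edge resize to 336, 336x336 center crop, normalization, then a single infer().
    EncodedImage encoded = encoder.encode(image, encoder.m_processor_config);

    // encoded holds the projected image features plus the crop size expressed in patches.
    return 0;
}
```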
1 change: 1 addition & 0 deletions src/cpp/src/vlm_config.cpp
@@ -10,6 +10,7 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) {
OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config");
nlohmann::json parsed = nlohmann::json::parse(stream);
using ov::genai::utils::read_json_param;
model_type = to_vlm_model_type(parsed.at("model_type"));
read_json_param(parsed, "hidden_size", hidden_size);
read_json_param(parsed, "scale_emb", scale_emb);
read_json_param(parsed, "query_num", query_num);