Hide VLM files and API #951

Merged · 14 commits · Oct 15, 2024
samples/cpp/visual_language_chat/visual_language_chat.cpp (1 addition, 1 deletion)

```diff
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0

 #include "load_image.hpp"
-#include <openvino/genai/vlm_pipeline.hpp>
+#include <openvino/genai/visual_language/vlm_pipeline.hpp>
 #include <openvino/runtime/intel_gpu/properties.hpp>

 bool print_subword(std::string&& subword) {
```
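For callers the only visible change is the include path. Below is a minimal usage sketch in the style of this sample; the constructor arguments, `utils::load_image`, and the `ov::genai::image` / `ov::genai::streamer` properties are assumptions carried over from the surrounding sample code, not something this diff introduces:

```cpp
// Hedged sketch of a consumer after this PR; only the header location changed.
#include "load_image.hpp"  // sample-local helper providing utils::load_image
#include <openvino/genai/visual_language/vlm_pipeline.hpp>  // new public path

#include <iostream>

bool print_subword(std::string&& subword) {
    std::cout << subword << std::flush;
    return false;  // false tells the pipeline to keep generating
}

int main(int argc, char* argv[]) try {
    ov::genai::VLMPipeline pipe(argv[1], "CPU");    // model dir + device
    ov::Tensor image = utils::load_image(argv[2]);  // image as ov::Tensor
    pipe.start_chat();
    pipe.generate("Describe this image.",
                  ov::genai::image(image),
                  ov::genai::streamer(print_subword));
    pipe.finish_chat();
} catch (const std::exception& e) {
    std::cerr << e.what() << '\n';
    return 1;
}
```

Returning `false` from the streamer callback continues generation; returning `true` would stop it early.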
```diff
@@ -6,54 +6,13 @@
 #include "openvino/genai/llm_pipeline.hpp"
 #include "openvino/genai/streamer_base.hpp"
 #include "openvino/genai/tokenizer.hpp"
-#include "openvino/genai/vision_encoder.hpp"
-#include "openvino/genai/vlm_config.hpp"
 #include <filesystem>

 namespace ov::genai {
-/// @brief A string prompt and source image.
-struct PromptImages {
-    /// @brief A prompt represented as std::string.
-    std::string prompt;
-    /// @brief An image represented as ov::Tensor.
-    std::vector<ov::Tensor> images;
-};
-
 /// @brief A Visual language modeling pipeline class used to generate a
 /// response or run a chat given a prompt and an image.
 class OPENVINO_GENAI_EXPORTS VLMPipeline {
 public:
-    // A config to follow for LLM input construction.
-    VLMConfig m_vlm_config;
-    // A config to follow for text generation.
-    GenerationConfig m_generation_config;
-    // A tokenizer encoding a prompt.
-    Tokenizer m_tokenizer;
-    // An encoder to infer embeddings of an image.
-    VisionEncoder m_vision_encoder;
-    // A resampler model to resample image embeddings.
-    // [N, H*W, old_hidden_size] is the input shape.
-    // [N, query_num, hidden_size] is the output shape.
-    ov::InferRequest m_resampler;
-    // A model to compute token embeddings.
-    // Input shape: [N, conversation length].
-    // Output shape: [1, conversation length, hidden_size].
-    ov::InferRequest m_embedding;
-    // A language model used to generate a response.
-    // Input shapes: inputs_embeds[N, conversation length, hidden_size],
-    // position_ids[N, conversation length], beam_idx[N].
-    // Output shape: logits[N, conversation length, vocab_size].
-    ov::InferRequest m_language;
-    // Precomputed positional embeddings for the resampler.
-    // [70, 70, hidden_size]. 70 is the initial guess of the image
-    // height and width after dividing by patch_size.
-    ov::Tensor m_pos_embed_cache;
-    // True if chat mode is activated to save conversation
-    // history between generate() calls.
-    bool m_is_chat_conversation;
-    ChatHistory m_history;
-    std::string m_templated_chat_history;
-    size_t image_id = 0; // Used to insert <image_id>i</image_id> per image (not a slice).
     /// @brief Construct a pipeline form a folder containing tokenizer
     /// and model IRs.
     /// @param model_dir A folder to read tokenizer and model IRs.
@@ -122,7 +81,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
     /// role.
     void start_chat(const std::string& system_message="");
     /// @brief Deactivate chat mode.
-    void finish_chat() {m_is_chat_conversation = false;}
+    void finish_chat();
     /// @brief Set a custom chat template. Can be used to deactivate
     /// chat_template application for chat mode if called with
     /// "{% for message in messages %}{{ message['content'] }}{% endfor %}"
@@ -139,9 +98,6 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
 private:
     class VLMPipelineImpl;
     std::unique_ptr<VLMPipelineImpl> m_pimpl;
-
-    ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images);
-    ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images);
 };

 /*
```
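`finish_chat()` loses its inline body because `m_is_chat_conversation` is no longer a member of the public class; all of that state now lives behind `m_pimpl`. A generic sketch of the pimpl idiom at work here, using hypothetical names rather than the repository's actual implementation:

```cpp
// pipeline.hpp -- the public header exposes only an opaque handle.
#include <memory>
#include <string>

class Pipeline {
public:
    Pipeline();
    ~Pipeline();         // defined in the .cpp, where Impl is complete
    void finish_chat();  // declaration only; the body needs Impl
private:
    class Impl;          // forward declaration keeps the layout private
    std::unique_ptr<Impl> m_pimpl;
};

// pipeline.cpp -- private state and method bodies live here and can
// change without recompiling users of the header.
class Pipeline::Impl {
public:
    bool m_is_chat_conversation = false;
    std::string m_templated_chat_history;
};

Pipeline::Pipeline() : m_pimpl(std::make_unique<Impl>()) {}
Pipeline::~Pipeline() = default;
void Pipeline::finish_chat() { m_pimpl->m_is_chat_conversation = false; }
```

The same move explains why `get_inputs_embeds_minicpm` and `get_inputs_embeds_llava` drop out of the header: model-specific helpers can now change freely without touching the public ABI or forcing client recompiles.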
src/cpp/src/utils.hpp (1 addition, 1 deletion)

```diff
@@ -6,7 +6,7 @@
 #include <nlohmann/json.hpp>

 #include "openvino/genai/llm_pipeline.hpp"
-#include "openvino/genai/processor_config.hpp"
+#include "visual_language/processor_config.hpp"

 namespace ov {
 namespace genai {
```
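The remaining hunks repeat one pattern: headers leave the installed openvino/genai/ include tree for a source-private visual_language/ directory. A sketch of the two include forms, with the directory layout inferred from the new paths rather than stated in the diff:

```cpp
// Public API header: part of the installed include tree, so it keeps
// the library prefix and stays visible to downstream users.
#include "openvino/genai/llm_pipeline.hpp"

// Internal header after this PR: lives beside the sources (presumably
// src/cpp/src/visual_language/) and resolves through the target's
// private include path, so it never ships in the install tree.
#include "visual_language/processor_config.hpp"
```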
src/cpp/src/vision_encoder.cpp (2 additions, 2 deletions)

```diff
@@ -1,8 +1,8 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0

-#include <openvino/genai/vision_encoder.hpp>
-#include "clip.hpp"
+#include "vision_encoder.hpp"
+#include "visual_language/clip.hpp"
 #include "utils.hpp"

 using namespace ov::genai;
```
```diff
@@ -3,9 +3,9 @@

 #pragma once

-#include "openvino/genai/processor_config.hpp"
 #include <openvino/openvino.hpp>
-#include "vlm_model_type.hpp"
+#include "visual_language/processor_config.hpp"
+#include "visual_language/vlm_model_type.hpp"

 namespace ov::genai {
 /// @brief A pair describing image size.
```
File renamed without changes.
File renamed without changes.
```diff
@@ -1,7 +1,7 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0

-#include "openvino/genai/processor_config.hpp"
+#include "processor_config.hpp"
 #include "utils.hpp"
 #include <fstream>
```
```diff
@@ -1,7 +1,7 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0

-#include "openvino/genai/vlm_config.hpp"
+#include "vlm_config.hpp"
 #include "utils.hpp"
 #include <fstream>
```
```diff
@@ -4,9 +4,9 @@
 #pragma once

 #include "openvino/genai/visibility.hpp"
+#include "visual_language/vlm_model_type.hpp"
 #include <openvino/runtime/properties.hpp>
 #include <filesystem>
-#include "vlm_model_type.hpp"

 namespace ov::genai {
 /// @brief A Configuration class passed to VLMPipeline and used to
```