From 61512311139daf87c1d8da5f62f08e59a11801a6 Mon Sep 17 00:00:00 2001
From: Vladimir
Date: Sat, 12 Oct 2024 14:44:12 +0400
Subject: [PATCH 01/28] Hide VLM files and API

---
 .../include/openvino/genai/vlm_pipeline.hpp   |  48 +-
 src/cpp/src/processor_config.cpp              |   2 +-
 .../genai => src}/processor_config.hpp        |   0
 src/cpp/src/utils.hpp                         |   2 +-
 src/cpp/src/vision_encoder.cpp                |   2 +-
 .../openvino/genai => src}/vision_encoder.hpp |   4 +-
 src/cpp/src/vlm_config.cpp                    |   2 +-
 .../openvino/genai => src}/vlm_config.hpp     |   2 +-
 src/cpp/src/vlm_pipeline.cpp                  | 847 +++++++++---------
 9 files changed, 442 insertions(+), 467 deletions(-)
 rename src/cpp/{include/openvino/genai => src}/processor_config.hpp (100%)
 rename src/cpp/{include/openvino/genai => src}/vision_encoder.hpp (98%)
 rename src/cpp/{include/openvino/genai => src}/vlm_config.hpp (98%)

diff --git a/src/cpp/include/openvino/genai/vlm_pipeline.hpp b/src/cpp/include/openvino/genai/vlm_pipeline.hpp
index 0eb0b5a646..bd83318bb4 100644
--- a/src/cpp/include/openvino/genai/vlm_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/vlm_pipeline.hpp
@@ -6,54 +6,13 @@
 #include "openvino/genai/llm_pipeline.hpp"
 #include "openvino/genai/streamer_base.hpp"
 #include "openvino/genai/tokenizer.hpp"
-#include "openvino/genai/vision_encoder.hpp"
-#include "openvino/genai/vlm_config.hpp"
+#include <filesystem>

 namespace ov::genai {
-/// @brief A string prompt and source image.
-struct PromptImages {
-    /// @brief A prompt represented as std::string.
-    std::string prompt;
-    /// @brief An image represented as ov::Tensor.
-    std::vector<ov::Tensor> images;
-};
-
 /// @brief A Visual language modeling pipeline class used to generate a
 /// response or run a chat given a prompt and an image.
 class OPENVINO_GENAI_EXPORTS VLMPipeline {
 public:
-    // A config to follow for LLM input construction.
-    VLMConfig m_vlm_config;
-    // A config to follow for text generation.
-    GenerationConfig m_generation_config;
-    // A tokenizer encoding a prompt.
-    Tokenizer m_tokenizer;
-    // An encoder to infer embeddings of an image.
-    VisionEncoder m_vision_encoder;
-    // A resampler model to resample image embeddings.
-    // [N, H*W, old_hidden_size] is the input shape.
-    // [N, query_num, hidden_size] is the output shape.
-    ov::InferRequest m_resampler;
-    // A model to compute token embeddings.
-    // Input shape: [N, conversation length].
-    // Output shape: [1, conversation length, hidden_size].
-    ov::InferRequest m_embedding;
-    // A language model used to generate a response.
-    // Input shapes: inputs_embeds[N, conversation length, hidden_size],
-    // position_ids[N, conversation length], beam_idx[N].
-    // Output shape: logits[N, conversation length, vocab_size].
-    ov::InferRequest m_language;
-    // Precomputed positional embeddings for the resampler.
-    // [70, 70, hidden_size]. 70 is the initial guess of the image
-    // height and width after dividing by patch_size.
-    ov::Tensor m_pos_embed_cache;
-    // True if chat mode is activated to save conversation
-    // history between generate() calls.
-    bool m_is_chat_conversation;
-    ChatHistory m_history;
-    std::string m_templated_chat_history;
-    size_t image_id = 0; // Used to insert <image_id>i</image_id> per image (not a slice).
-
     /// @brief Construct a pipeline form a folder containing tokenizer
     /// and model IRs.
     /// @param model_dir A folder to read tokenizer and model IRs.
     /// @param device Inference device. A tokenizer is always compiled
     /// role.
     void start_chat(const std::string& system_message="");
     /// @brief Deactivate chat mode.
-    void finish_chat() {m_is_chat_conversation = false;}
+    void finish_chat();
     /// @brief Set a custom chat template. Can be used to deactivate
     /// chat_template application for chat mode if called with
     /// "{% for message in messages %}{{ message['content'] }}{% endfor %}"
@@ -139,9 +98,6 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
 private:
     class VLMPipelineImpl;
     std::unique_ptr<VLMPipelineImpl> m_pimpl;
-
-    ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images);
-    ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images);
 };

 /*
diff --git a/src/cpp/src/processor_config.cpp b/src/cpp/src/processor_config.cpp
index cea7f98fd4..22d068feaf 100644
--- a/src/cpp/src/processor_config.cpp
+++ b/src/cpp/src/processor_config.cpp
@@ -1,7 +1,7 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0

-#include "openvino/genai/processor_config.hpp"
+#include "processor_config.hpp"
 #include "utils.hpp"
 #include
diff --git a/src/cpp/include/openvino/genai/processor_config.hpp b/src/cpp/src/processor_config.hpp
similarity index 100%
rename from src/cpp/include/openvino/genai/processor_config.hpp
rename to src/cpp/src/processor_config.hpp
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index b5228eede0..3ba551e169 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -6,7 +6,7 @@
 #include
 #include "openvino/genai/llm_pipeline.hpp"
-#include "openvino/genai/processor_config.hpp"
+#include "processor_config.hpp"

 namespace ov {
 namespace genai {
diff --git a/src/cpp/src/vision_encoder.cpp b/src/cpp/src/vision_encoder.cpp
index 6c926e0ed8..1153329b98 100644
--- a/src/cpp/src/vision_encoder.cpp
+++ b/src/cpp/src/vision_encoder.cpp
@@ -1,7 +1,7 @@
 // Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0

-#include
+#include "vision_encoder.hpp"
 #include "clip.hpp"
 #include "utils.hpp"
diff --git a/src/cpp/include/openvino/genai/vision_encoder.hpp b/src/cpp/src/vision_encoder.hpp
similarity index 98%
rename from src/cpp/include/openvino/genai/vision_encoder.hpp
rename to src/cpp/src/vision_encoder.hpp
index 902557d316..7cf8c88e71 100644
--- a/src/cpp/include/openvino/genai/vision_encoder.hpp
+++ b/src/cpp/src/vision_encoder.hpp
@@ -3,9 +3,9 @@

 #pragma once

-#include "openvino/genai/processor_config.hpp"
 #include
-#include "vlm_model_type.hpp"
+#include "processor_config.hpp"
+#include "openvino/genai/vlm_model_type.hpp"

 namespace ov::genai {
 /// @brief A pair describing image size.
diff --git a/src/cpp/src/vlm_config.cpp b/src/cpp/src/vlm_config.cpp index 8d7585f2bb..f3a54c5ec7 100644 --- a/src/cpp/src/vlm_config.cpp +++ b/src/cpp/src/vlm_config.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "openvino/genai/vlm_config.hpp" +#include "vlm_config.hpp" #include "utils.hpp" #include diff --git a/src/cpp/include/openvino/genai/vlm_config.hpp b/src/cpp/src/vlm_config.hpp similarity index 98% rename from src/cpp/include/openvino/genai/vlm_config.hpp rename to src/cpp/src/vlm_config.hpp index 46983c080a..11f91cda2e 100644 --- a/src/cpp/include/openvino/genai/vlm_config.hpp +++ b/src/cpp/src/vlm_config.hpp @@ -4,9 +4,9 @@ #pragma once #include "openvino/genai/visibility.hpp" +#include "openvino/genai/vlm_model_type.hpp" #include #include -#include "vlm_model_type.hpp" namespace ov::genai { /// @brief A Configuration class passed to VLMPipeline and used to diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index 0678f2b074..ef7a90b717 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -5,9 +5,11 @@ #include "openvino/genai/tokenizer.hpp" #include "vlm_sampling.hpp" #include "clip.hpp" -#include -#include "../src/text_callback_streamer.hpp" +#include "text_callback_streamer.hpp" #include "utils.hpp" +#include "vision_encoder.hpp" +#include "vlm_config.hpp" +#include #include #include @@ -19,65 +21,6 @@ template overloaded(Ts...) -> overloaded; constexpr size_t BATCH_SIZE = 1; -struct Args { - bool do_sample = false; - int top_k = 0; - float top_p = 0.7f; - float temp = 0.95f; - float repeat_penalty = 1.0f; -}; - -int64_t get_out_token_id(const std::vector& input_ids, float* logits, size_t vocab_size, Args args) { - int64_t out_token; - - // logits pre-process - if (args.repeat_penalty != 1.f) { - sampling_repetition_penalty(logits, logits + vocab_size, input_ids, args.repeat_penalty); - } - - if (args.do_sample) - { - if (args.temp > 0) { - sampling_temperature(logits, logits + vocab_size, args.temp); - } - - std::vector token_scores(vocab_size); - for (int i = 0; i < vocab_size; i++) { - token_scores[i] = TokenIdScore(i, logits[i]); - } - - // top_k sampling - if (0 < args.top_k && args.top_k < (int)token_scores.size()) { - sampling_top_k(token_scores.data(), token_scores.data() + args.top_k, - token_scores.data() + token_scores.size()); - token_scores.resize(args.top_k); - } - - // top_p sampling - if (0.f < args.top_p && args.top_p < 1.f) { - auto pos = sampling_top_p(token_scores.data(), token_scores.data() + token_scores.size(), args.top_p); - token_scores.resize(pos - token_scores.data()); - } - - // sample next token - sampling_softmax_inplace(token_scores.data(), token_scores.data() + token_scores.size()); - for (size_t i = 0; i < token_scores.size(); i++) { - logits[i] = token_scores[i].score; - } - - thread_local std::random_device rd; - thread_local std::mt19937 gen(rd()); - - std::discrete_distribution<> dist(logits, logits + token_scores.size()); - out_token = token_scores[dist(gen)].id; - } - else { - out_token = std::max_element(logits, logits + vocab_size) - logits; - } - - return out_token; -} - ov::Tensor process_prompt(ov::InferRequest& embedding, const ov::Tensor& prompt, float scale_emb) { embedding.set_input_tensor(prompt); embedding.infer(); @@ -250,51 +193,6 @@ void adjust_pos_cache( } } -ov::Tensor resample(VLMPipeline& pipe, const ov::Tensor& encoded_image, const std::vector& target_sizes) { - size_t bs = 
encoded_image.get_shape().at(0); - std::vector patch_len{target_sizes.size()}; - std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const ImageSize& height_width) { - return height_width.height * height_width.width; - }); - adjust_pos_cache( - target_sizes, - pipe.m_vlm_config.hidden_size, - pipe.m_pos_embed_cache - ); - size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end()); - ov::Tensor key_padding_mask(ov::element::boolean, {bs, max_patch_len}); - bool* mask_data = key_padding_mask.data(); - size_t embed_len = pipe.m_pos_embed_cache.get_shape().at(2); - ov::Tensor pos_embed(ov::element::f32, {max_patch_len, bs, embed_len}); // BLD => L * B * D - float* pos_embed_data = pos_embed.data(); - float* cache_data = pipe.m_pos_embed_cache.data(); - size_t _d0 = pipe.m_pos_embed_cache.get_shape().at(0); - size_t _d1 = pipe.m_pos_embed_cache.get_shape().at(1); - for (size_t i = 0; i < bs; ++i) { - size_t target_h = target_sizes.at(i).height; - size_t target_w = target_sizes.at(i).width; - for (size_t h_idx = 0; h_idx < target_h; ++h_idx) { - for (size_t w_idx = 0; w_idx < target_w; ++w_idx) { - std::copy_n( - cache_data + h_idx * _d1 + w_idx, - embed_len, - pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * embed_len - ); - } - } - for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) { - std::fill_n(pos_embed_data + flat * bs * embed_len + i * embed_len, embed_len, 0.0f); - } - std::fill_n(mask_data + i * max_patch_len, patch_len[i], false); - std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], true); - } - pipe.m_resampler.set_tensor("x", encoded_image); // [N, H*W, old_hidden_size] - pipe.m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] - pipe.m_resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W] - pipe.m_resampler.infer(); - return pipe.m_resampler.get_output_tensor(); // [N, query_num, new_hidden_size] -} - ov::Tensor merge_text_and_image_embeddings_llava( const ov::Tensor& input_ids, const ov::Tensor& text_embeds, @@ -345,358 +243,479 @@ ov::Tensor merge_text_and_image_embeddings_llava( } class ov::genai::VLMPipeline::VLMPipelineImpl { -}; +public: + // A config to follow for LLM input construction. + VLMConfig m_vlm_config; + // A config to follow for text generation. + GenerationConfig m_generation_config; + // A tokenizer encoding a prompt. + Tokenizer m_tokenizer; + // An encoder to infer embeddings of an image. + VisionEncoder m_vision_encoder; + // A resampler model to resample image embeddings. + // [N, H*W, old_hidden_size] is the input shape. + // [N, query_num, hidden_size] is the output shape. + ov::InferRequest m_resampler; + // A model to compute token embeddings. + // Input shape: [N, conversation length]. + // Output shape: [1, conversation length, hidden_size]. + ov::InferRequest m_embedding; + // A language model used to generate a response. + // Input shapes: inputs_embeds[N, conversation length, hidden_size], + // position_ids[N, conversation length], beam_idx[N]. + // Output shape: logits[N, conversation length, vocab_size]. + ov::InferRequest m_language; + // Precomputed positional embeddings for the resampler. + // [70, 70, hidden_size]. 70 is the initial guess of the image + // height and width after dividing by patch_size. + ov::Tensor m_pos_embed_cache; + // True if chat mode is activated to save conversation + // history between generate() calls. 
+ bool m_is_chat_conversation; + ChatHistory m_history; + std::string m_templated_chat_history; + size_t image_id = 0; // Used to insert i per image (not a slice). + + VLMPipelineImpl( + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config + ) : + m_vlm_config{ + utils::from_config_json_if_exists( + model_dir, "config.json" + ) + }, + m_tokenizer{Tokenizer(model_dir.string(), device_config)}, + m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, ov::Core{}), + m_is_chat_conversation{false} { + if (m_vlm_config.model_type == VLMModelType::MINICPM) { + m_resampler = ov::Core{}.compile_model( + model_dir / "resampler.xml", device, device_config + ).create_infer_request(); + + m_embedding = ov::Core{}.compile_model( + model_dir / "embed_tokens.xml", device, device_config + ).create_infer_request(); + + m_language = ov::Core{}.compile_model( + model_dir / "language_model.xml", device, device_config + ).create_infer_request(); + + m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); + } else if (m_vlm_config.model_type == VLMModelType::LLAVA) { + m_language = ov::Core{}.compile_model( + model_dir / "openvino_language_model.xml", device, device_config + ).create_infer_request(); + + // Reusing the same m_embedding for llava text_embeddings model + m_embedding = ov::Core{}.compile_model( + model_dir / "openvino_text_embeddings_model.xml", device, device_config + ).create_infer_request(); + } -VLMPipeline::VLMPipeline( - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config -) : - m_vlm_config{ - utils::from_config_json_if_exists( - model_dir, "config.json" - ) - }, - m_tokenizer{Tokenizer(model_dir.string(), device_config)}, - m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, ov::Core{}), - m_is_chat_conversation{false} { + m_language.get_tensor("attention_mask").set_shape({1, 0}); + } + + DecodedResults generate( + const std::string& prompt, + const std::vector& rgbs, + const GenerationConfig& generation_config, + const StreamerVariant& streamer + ) { + ov::Tensor inputs_embeds; if (m_vlm_config.model_type == VLMModelType::MINICPM) { - m_resampler = ov::Core{}.compile_model( - model_dir / "resampler.xml", device, device_config - ).create_infer_request(); + inputs_embeds = get_inputs_embeds_minicpm(prompt, rgbs); + } else if (m_vlm_config.model_type == VLMModelType::LLAVA) { + inputs_embeds = get_inputs_embeds_llava(prompt, rgbs); + } - m_embedding = ov::Core{}.compile_model( - model_dir / "embed_tokens.xml", device, device_config - ).create_infer_request(); + m_language.set_tensor("inputs_embeds", inputs_embeds); + size_t history_len = m_language.get_tensor("attention_mask").get_shape().at(1); + m_language.get_tensor("attention_mask").set_shape({1, history_len + inputs_embeds.get_shape()[1]}); + std::fill_n(m_language.get_tensor("attention_mask").data(), m_language.get_tensor("attention_mask").get_size(), 1); - m_language = ov::Core{}.compile_model( - model_dir / "language_model.xml", device, device_config - ).create_infer_request(); + m_language.get_tensor("position_ids").set_shape({1, inputs_embeds.get_shape().at(1)}); + std::iota(m_language.get_tensor("position_ids").data(), m_language.get_tensor("position_ids").data() + m_language.get_tensor("position_ids").get_size(), history_len); - m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); - } else if (m_vlm_config.model_type == 
VLMModelType::LLAVA) { - m_language = ov::Core{}.compile_model( - model_dir / "openvino_language_model.xml", device, device_config - ).create_infer_request(); - - // Reusing the same m_embedding for llava text_embeddings model - m_embedding = ov::Core{}.compile_model( - model_dir / "openvino_text_embeddings_model.xml", device, device_config - ).create_infer_request(); - } + m_language.get_tensor("beam_idx").set_shape({ BATCH_SIZE }); + m_language.get_tensor("beam_idx").data()[0] = 0; - m_language.get_tensor("attention_mask").set_shape({1, 0}); - } + m_language.infer(); -ov::genai::VLMPipeline::~VLMPipeline() = default; + ov::Shape logits_shape = m_language.get_tensor("logits").get_shape(); + auto attention_size = m_language.get_tensor("attention_mask").get_size(); + + int64_t sequence_len = m_language.get_tensor("logits").get_shape().at(1) - 1; + size_t vocab_size = m_language.get_tensor("logits").get_shape().back(); + float* logits = m_language.get_tensor("logits").data() + sequence_len * vocab_size; + int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; + + m_language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size}); + m_language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 }); + + m_embedding.get_input_tensor().set_shape({ 1, 1 }); + + int64_t eos_token_id = m_tokenizer.get_eos_token_id(); + std::shared_ptr streamer_ptr = std::visit(overloaded{ + [&m_tokenizer = m_tokenizer]( + const std::function& callback + ) -> std::shared_ptr { + return std::make_shared(m_tokenizer, callback); + }, + [](const std::shared_ptr& ptr) { + return ptr; + }, + [](std::monostate) { + return std::shared_ptr{nullptr}; + }, + }, streamer); + std::vector generated; + while (true) { //(out_token != eos_token_id) + m_embedding.get_input_tensor().data()[0] = out_token; + m_embedding.infer(); + const ov::Tensor& embed_prompt_tensor = m_embedding.get_output_tensor(); + float* embed_data = embed_prompt_tensor.data(); + for (auto idx = 0; idx < embed_prompt_tensor.get_size(); idx++) { + embed_data[idx] = embed_data[idx] * m_vlm_config.scale_emb; + } -DecodedResults VLMPipeline::generate( - const std::string& prompt, - const std::vector& rgbs, - const GenerationConfig& generation_config, - const StreamerVariant& streamer -) { - ov::Tensor inputs_embeds; - if (m_vlm_config.model_type == VLMModelType::MINICPM) { - inputs_embeds = get_inputs_embeds_minicpm(prompt, rgbs); - } else if (m_vlm_config.model_type == VLMModelType::LLAVA) { - inputs_embeds = get_inputs_embeds_llava(prompt, rgbs); - } + m_language.set_tensor("inputs_embeds", embed_prompt_tensor); + m_language.get_tensor("attention_mask").set_shape({ BATCH_SIZE, m_language.get_tensor("attention_mask").get_shape()[1] + 1 }); + std::fill_n(m_language.get_tensor("attention_mask").data(), m_language.get_tensor("attention_mask").get_size(), 1); + m_language.get_tensor("position_ids").data()[0] = int64_t(m_language.get_tensor("attention_mask").get_size() - 2); - m_language.set_tensor("inputs_embeds", inputs_embeds); - size_t history_len = m_language.get_tensor("attention_mask").get_shape().at(1); - m_language.get_tensor("attention_mask").set_shape({1, history_len + inputs_embeds.get_shape()[1]}); - std::fill_n(m_language.get_tensor("attention_mask").data(), m_language.get_tensor("attention_mask").get_size(), 1); - - m_language.get_tensor("position_ids").set_shape({1, inputs_embeds.get_shape().at(1)}); - std::iota(m_language.get_tensor("position_ids").data(), m_language.get_tensor("position_ids").data() + 
m_language.get_tensor("position_ids").get_size(), history_len); - - m_language.get_tensor("beam_idx").set_shape({ BATCH_SIZE }); - m_language.get_tensor("beam_idx").data()[0] = 0; - - m_language.infer(); - - ov::Shape logits_shape = m_language.get_tensor("logits").get_shape(); - auto attention_size = m_language.get_tensor("attention_mask").get_size(); - - int64_t sequence_len = m_language.get_tensor("logits").get_shape().at(1) - 1; - size_t vocab_size = m_language.get_tensor("logits").get_shape().back(); - float* logits = m_language.get_tensor("logits").data() + sequence_len * vocab_size; - int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; - - m_language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size}); - m_language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 }); - - m_embedding.get_input_tensor().set_shape({ 1, 1 }); - - int64_t eos_token_id = m_tokenizer.get_eos_token_id(); - std::shared_ptr streamer_ptr = std::visit(overloaded{ - [&m_tokenizer = m_tokenizer]( - const std::function& callback - ) -> std::shared_ptr { - return std::make_shared(m_tokenizer, callback); - }, - [](const std::shared_ptr& ptr) { - return ptr; - }, - [](std::monostate) { - return std::shared_ptr{nullptr}; - }, - }, streamer); - std::vector generated; - while (true) { //(out_token != eos_token_id) - m_embedding.get_input_tensor().data()[0] = out_token; - m_embedding.infer(); - const ov::Tensor& embed_prompt_tensor = m_embedding.get_output_tensor(); - float* embed_data = embed_prompt_tensor.data(); - for (auto idx = 0; idx < embed_prompt_tensor.get_size(); idx++) { - embed_data[idx] = embed_data[idx] * m_vlm_config.scale_emb; - } + m_language.infer(); - m_language.set_tensor("inputs_embeds", embed_prompt_tensor); - m_language.get_tensor("attention_mask").set_shape({ BATCH_SIZE, m_language.get_tensor("attention_mask").get_shape()[1] + 1 }); - std::fill_n(m_language.get_tensor("attention_mask").data(), m_language.get_tensor("attention_mask").get_size(), 1); - m_language.get_tensor("position_ids").data()[0] = int64_t(m_language.get_tensor("attention_mask").get_size() - 2); + generated.push_back(out_token); + if (streamer_ptr && streamer_ptr->put(out_token)) { + break; + } + logits = m_language.get_tensor("logits").data(); - m_language.infer(); + out_token = std::max_element(logits, logits + vocab_size) - logits; + if (out_token == eos_token_id) { + break; + } + } - generated.push_back(out_token); - if (streamer_ptr && streamer_ptr->put(out_token)) { - break; + if (streamer_ptr) { + streamer_ptr->end(); } - logits = m_language.get_tensor("logits").data(); - out_token = std::max_element(logits, logits + vocab_size) - logits; - if (out_token == eos_token_id) { - break; + std::string decoded_results = m_tokenizer.decode(generated); + if (m_is_chat_conversation) { + // Tail of chat template is missing in KV cache. + // Find the tail to concatenate it with the next input prompt. 
+ m_templated_chat_history.append(decoded_results); + m_history.push_back({{"role", "assistant"}, {"content", decoded_results}}); + } else { + for (auto& variable : m_language.query_state()) { + variable.reset(); + } + m_language.get_tensor("attention_mask").set_shape({1, 0}); } + return {{std::move(decoded_results)}}; } - if (streamer_ptr) { - streamer_ptr->end(); + DecodedResults generate( + const std::string& prompt, + const ov::AnyMap& config_map + ) { + auto image = config_map.find(ov::genai::image.name()); + auto images = config_map.find(ov::genai::images.name()); + OPENVINO_ASSERT( + config_map.end() == image || config_map.end() == images, + "Only one property can be set: image of images." + ); + std::vector rgbs; + if (config_map.end() != image) { + rgbs = {image->second.as()}; + } if (config_map.end() != images) { + rgbs = images->second.as>(); + } + ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map); + GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); + config.update_generation_config(config_map); + return generate( + prompt, + rgbs, + config, + utils::get_streamer_from_map(config_map) + ); } - std::string decoded_results = m_tokenizer.decode(generated); - if (m_is_chat_conversation) { - // Tail of chat template is missing in KV cache. - // Find the tail to concatenate it with the next input prompt. - m_templated_chat_history.append(decoded_results); - m_history.push_back({{"role", "assistant"}, {"content", decoded_results}}); - } else { - for (auto& variable : m_language.query_state()) { - variable.reset(); + void start_chat(const std::string& system_message) { + m_is_chat_conversation = true; + bool have_state = 0 != m_language.get_tensor("attention_mask").get_size(); + if (have_state) { + // Resetting state may be slow. + for (ov::VariableState& variable : m_language.query_state()) { + variable.reset(); + } + // Since if is already introduced, move all resetting here. + m_language.get_tensor("attention_mask").set_shape({1, 0}); + m_history.clear(); + m_templated_chat_history.clear(); } - m_language.get_tensor("attention_mask").set_shape({1, 0}); + if (system_message.empty()) { + return; + } + m_history = {{{"role", "system"}, {"content", system_message}}}; + constexpr bool add_generation_prompt = false; + m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); } - return {{std::move(decoded_results)}}; -} -DecodedResults VLMPipeline::generate( - const std::string& prompt, - const ov::AnyMap& config_map -) { - auto image = config_map.find(ov::genai::image.name()); - auto images = config_map.find(ov::genai::images.name()); - OPENVINO_ASSERT( - config_map.end() == image || config_map.end() == images, - "Only one property can be set: image of images." - ); - std::vector rgbs; - if (config_map.end() != image) { - rgbs = {image->second.as()}; - } if (config_map.end() != images) { - rgbs = images->second.as>(); - } - ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map); - GenerationConfig config = (config_arg.has_value()) ? 
*config_arg : get_generation_config(); - config.update_generation_config(config_map); - return generate( - prompt, - rgbs, - config, - utils::get_streamer_from_map(config_map) - ); -} + void finish_chat() {m_is_chat_conversation = false;} -void VLMPipeline::start_chat(const std::string& system_message) { - m_is_chat_conversation = true; - bool have_state = 0 != m_language.get_tensor("attention_mask").get_size(); - if (have_state) { - // Resetting state may be slow. - for (ov::VariableState& variable : m_language.query_state()) { - variable.reset(); - } - // Since if is already introduced, move all resetting here. - m_language.get_tensor("attention_mask").set_shape({1, 0}); - m_history.clear(); - m_templated_chat_history.clear(); + void set_chat_template(const std::string& new_template) { + m_tokenizer.set_chat_template(new_template); } - if (system_message.empty()) { - return; - } - m_history = {{{"role", "system"}, {"content", system_message}}}; - constexpr bool add_generation_prompt = false; - m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); -} -void VLMPipeline::set_chat_template(const std::string& new_template) { - m_tokenizer.set_chat_template(new_template); -} + GenerationConfig get_generation_config() const { + return m_generation_config; + } -GenerationConfig VLMPipeline::get_generation_config() const { - return m_generation_config; -} + void set_generation_config(const GenerationConfig& new_config) { + m_generation_config = new_config; + } -void VLMPipeline::set_generation_config(const GenerationConfig& new_config) { - m_generation_config = new_config; -} + ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector& images) { + std::string image_token = ""; // TODO Consider getting from vlm_config or json + std::string formatted_prompt = "USER: " + (images.empty() ? prompt : image_token + "\n" + prompt) + " ASSISTANT:"; + ov::Tensor input_ids = m_tokenizer.encode(formatted_prompt).input_ids; + if (images.empty()) { + return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); + } else { + OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed"); + EncodedImage encoded_image = m_vision_encoder.encode(images.at(0)); + ov::Tensor image_embeds = encoded_image.resized_source; -ov::Tensor VLMPipeline::get_inputs_embeds_llava(const std::string& prompt, const std::vector& images) { - std::string image_token = ""; // TODO Consider getting from vlm_config or json - std::string formatted_prompt = "USER: " + (images.empty() ? 
prompt : image_token + "\n" + prompt) + " ASSISTANT:"; - ov::Tensor input_ids = m_tokenizer.encode(formatted_prompt).input_ids; - if (images.empty()) { - return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); - } else { - OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed"); - EncodedImage encoded_image = m_vision_encoder.encode(images.at(0)); - ov::Tensor image_embeds = encoded_image.resized_source; - - ov::Tensor text_embeds = process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); + ov::Tensor text_embeds = process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); - int64_t image_token_index = 32000; // TODO Consider getting from m_vlm_config.image_token_index or config.json + int64_t image_token_index = 32000; // TODO Consider getting from m_vlm_config.image_token_index or config.json - return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_index); + return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_index); + } } -} -ov::Tensor VLMPipeline::get_inputs_embeds_minicpm(const std::string& prompt, const std::vector& images) { - std::string images_prompt; - std::vector embeds; - for (const ov::Tensor& rgb : images) { - ov::Tensor reshaped = rgb; - ov::Shape rgb_shape = rgb.get_shape(); - switch (rgb_shape.size()) { - case 3: - reshaped.set_shape({1, rgb_shape.at(0), rgb_shape.at(1), rgb_shape.at(2)}); - break; - case 4: break; - default: OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); - } - ov::Shape reshaped_shape = reshaped.get_shape(); - for (size_t batch_idx = 0; batch_idx < reshaped_shape.at(0); ++batch_idx) { - ov::Tensor single_image{ - ov::element::u8, - {1, reshaped_shape.at(1), reshaped_shape.at(2), reshaped_shape.at(3)}, - reshaped.data() + batch_idx * reshaped_shape.at(1) * reshaped_shape.at(1) * reshaped_shape.at(1) - }; - EncodedImage encoded_image = m_vision_encoder.encode(single_image); - if (m_vlm_config.use_image_id) { - images_prompt += m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end; - ++image_id; - } - std::string unk64; - for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { - unk64 += m_vlm_config.unk; + ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector& images) { + std::string images_prompt; + std::vector embeds; + for (const ov::Tensor& rgb : images) { + ov::Tensor reshaped = rgb; + ov::Shape rgb_shape = rgb.get_shape(); + switch (rgb_shape.size()) { + case 3: + reshaped.set_shape({1, rgb_shape.at(0), rgb_shape.at(1), rgb_shape.at(2)}); + break; + case 4: break; + default: OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); } - images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; - if (encoded_image.slices) { - ov::Shape slices_shape = encoded_image.slices.get_shape(); - for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { - for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { - images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; + ov::Shape reshaped_shape = reshaped.get_shape(); + for (size_t batch_idx = 0; batch_idx < reshaped_shape.at(0); ++batch_idx) { + ov::Tensor single_image{ + ov::element::u8, + {1, reshaped_shape.at(1), reshaped_shape.at(2), reshaped_shape.at(3)}, + reshaped.data() + batch_idx * reshaped_shape.at(1) * reshaped_shape.at(1) * reshaped_shape.at(1) + }; + EncodedImage encoded_image = m_vision_encoder.encode(single_image); + if (m_vlm_config.use_image_id) { 
+ images_prompt += m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end; + ++image_id; + } + std::string unk64; + for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { + unk64 += m_vlm_config.unk; + } + images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; + if (encoded_image.slices) { + ov::Shape slices_shape = encoded_image.slices.get_shape(); + for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { + for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { + images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; + } + images_prompt += '\n'; } + } + if ('\n' != *(images_prompt.end() - 1)) { + // Image wasn't sliced, add \n to the end of image anyway. + // Strangely, \n isn't placed between . images_prompt += '\n'; } + embeds.push_back(std::move(encoded_image)); } - if ('\n' != *(images_prompt.end() - 1)) { - // Image wasn't sliced, add \n to the end of image anyway. - // Strangely, \n isn't placed between . - images_prompt += '\n'; - } - embeds.push_back(std::move(encoded_image)); } - } - images_prompt += prompt; - ov::Tensor encoded_input; - if (m_is_chat_conversation) { - // KV cache in model already contains prompts and answers from previous iterations. - // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns - // token_ids = {, ...}. So if tokenizer applies only to the new prompt, - // will be inserted on every iteration. - // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt - // and takes only the difference between them. - // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but - // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. - m_history.push_back({{"role", "user"}, {"content", images_prompt}}); - constexpr bool add_generation_prompt = true; - std::string new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); - ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids; - if (0 == m_language.get_tensor("attention_mask").get_shape().at(1)) { - encoded_input = new_chat_tokens; + images_prompt += prompt; + ov::Tensor encoded_input; + if (m_is_chat_conversation) { + // KV cache in model already contains prompts and answers from previous iterations. + // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns + // token_ids = {, ...}. So if tokenizer applies only to the new prompt, + // will be inserted on every iteration. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt + // and takes only the difference between them. + // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but + // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. 
+ m_history.push_back({{"role", "user"}, {"content", images_prompt}}); + constexpr bool add_generation_prompt = true; + std::string new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids; + if (0 == m_language.get_tensor("attention_mask").get_shape().at(1)) { + encoded_input = new_chat_tokens; + } else { + TokenizedInputs prev_chat_tokens = m_tokenizer.encode( + m_templated_chat_history + ); + encoded_input = utils::subtract_chat_tokenized_inputs( + {new_chat_tokens}, prev_chat_tokens + ).input_ids; + } + m_templated_chat_history = std::move(new_templated_chat_history); } else { - TokenizedInputs prev_chat_tokens = m_tokenizer.encode( - m_templated_chat_history - ); - encoded_input = utils::subtract_chat_tokenized_inputs( - {new_chat_tokens}, prev_chat_tokens - ).input_ids; + encoded_input = m_tokenizer.encode(images_prompt).input_ids; } - m_templated_chat_history = std::move(new_templated_chat_history); - } else { - encoded_input = m_tokenizer.encode(images_prompt).input_ids; + m_embedding.set_input_tensor(encoded_input); + m_embedding.infer(); + ov::Tensor inputs_embeds = m_embedding.get_output_tensor(); + OPENVINO_ASSERT( + m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), + "Unexpected embedding size" + ); + ov::Tensor special_tokens = m_tokenizer.encode( + m_vlm_config.im_start + + m_vlm_config.im_end + + m_vlm_config.slice_start + + m_vlm_config.slice_end + ).input_ids; + OPENVINO_ASSERT( + 4 == special_tokens.get_shape().at(1), + "Every special token must be represented with a single int." + ); + int64_t im_start_id = special_tokens.data()[0]; + int64_t im_end_id = special_tokens.data()[1]; + int64_t slice_start_id = special_tokens.data()[2]; + int64_t slice_end_id = special_tokens.data()[3]; + int64_t im_start_pos = 0, slice_start_pos = 0; + int64_t* begin = encoded_input.data(); + int64_t* ids = begin; + size_t encoded_input_size = encoded_input.get_size(); + int64_t* end = ids + encoded_input_size; + float* inputs_embeds_data = inputs_embeds.data(); + for (const EncodedImage& encoded_image : embeds) { + const ov::Tensor& resampled_source = resample(*this, encoded_image.resized_source, {encoded_image.resized_source_size}); + float* emb = resampled_source.data(); + ids = std::find(ids, end, im_start_id); + OPENVINO_ASSERT(end != ids); + std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + ids += m_vlm_config.query_num; + if (encoded_image.slices) { + size_t token_idx = 0; + const ov::Shape& slices_shape = encoded_image.slices.get_shape(); + for (size_t i = 0; i < slices_shape.at(0); ++i) { + for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { + size_t d2 = slices_shape.at(2); + size_t d3 = slices_shape.at(3); + ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; + const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {encoded_image.slices_size}); + ids = std::find(ids, end, slice_start_id); + OPENVINO_ASSERT(end != ids); + std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + ids += m_vlm_config.query_num; + } + } + } + } + + return inputs_embeds; } - m_embedding.set_input_tensor(encoded_input); - m_embedding.infer(); - ov::Tensor inputs_embeds = m_embedding.get_output_tensor(); 
- OPENVINO_ASSERT( - m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), - "Unexpected embedding size" - ); - ov::Tensor special_tokens = m_tokenizer.encode( - m_vlm_config.im_start - + m_vlm_config.im_end - + m_vlm_config.slice_start - + m_vlm_config.slice_end - ).input_ids; - OPENVINO_ASSERT( - 4 == special_tokens.get_shape().at(1), - "Every special token must be represented with a single int." - ); - int64_t im_start_id = special_tokens.data()[0]; - int64_t im_end_id = special_tokens.data()[1]; - int64_t slice_start_id = special_tokens.data()[2]; - int64_t slice_end_id = special_tokens.data()[3]; - int64_t im_start_pos = 0, slice_start_pos = 0; - int64_t* begin = encoded_input.data(); - int64_t* ids = begin; - size_t encoded_input_size = encoded_input.get_size(); - int64_t* end = ids + encoded_input_size; - float* inputs_embeds_data = inputs_embeds.data(); - for (const EncodedImage& encoded_image : embeds) { - const ov::Tensor& resampled_source = resample(*this, encoded_image.resized_source, {encoded_image.resized_source_size}); - float* emb = resampled_source.data(); - ids = std::find(ids, end, im_start_id); - OPENVINO_ASSERT(end != ids); - std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); - ids += m_vlm_config.query_num; - if (encoded_image.slices) { - size_t token_idx = 0; - const ov::Shape& slices_shape = encoded_image.slices.get_shape(); - for (size_t i = 0; i < slices_shape.at(0); ++i) { - for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { - size_t d2 = slices_shape.at(2); - size_t d3 = slices_shape.at(3); - ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; - const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {encoded_image.slices_size}); - ids = std::find(ids, end, slice_start_id); - OPENVINO_ASSERT(end != ids); - std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); - ids += m_vlm_config.query_num; + + ov::Tensor resample(VLMPipeline::VLMPipelineImpl& pipe, const ov::Tensor& encoded_image, const std::vector& target_sizes) { + size_t bs = encoded_image.get_shape().at(0); + std::vector patch_len{target_sizes.size()}; + std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const ImageSize& height_width) { + return height_width.height * height_width.width; + }); + adjust_pos_cache( + target_sizes, + pipe.m_vlm_config.hidden_size, + pipe.m_pos_embed_cache + ); + size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end()); + ov::Tensor key_padding_mask(ov::element::boolean, {bs, max_patch_len}); + bool* mask_data = key_padding_mask.data(); + size_t embed_len = pipe.m_pos_embed_cache.get_shape().at(2); + ov::Tensor pos_embed(ov::element::f32, {max_patch_len, bs, embed_len}); // BLD => L * B * D + float* pos_embed_data = pos_embed.data(); + float* cache_data = pipe.m_pos_embed_cache.data(); + size_t _d0 = pipe.m_pos_embed_cache.get_shape().at(0); + size_t _d1 = pipe.m_pos_embed_cache.get_shape().at(1); + for (size_t i = 0; i < bs; ++i) { + size_t target_h = target_sizes.at(i).height; + size_t target_w = target_sizes.at(i).width; + for (size_t h_idx = 0; h_idx < target_h; ++h_idx) { + for (size_t w_idx = 0; w_idx < target_w; ++w_idx) { + std::copy_n( + cache_data + h_idx * _d1 + w_idx, + embed_len, + pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * 
embed_len + ); } } + for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) { + std::fill_n(pos_embed_data + flat * bs * embed_len + i * embed_len, embed_len, 0.0f); + } + std::fill_n(mask_data + i * max_patch_len, patch_len[i], false); + std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], true); } + pipe.m_resampler.set_tensor("x", encoded_image); // [N, H*W, old_hidden_size] + pipe.m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] + pipe.m_resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W] + pipe.m_resampler.infer(); + return pipe.m_resampler.get_output_tensor(); // [N, query_num, new_hidden_size] } +}; - return inputs_embeds; +VLMPipeline::VLMPipeline( + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config +) : m_pimpl{std::make_unique(model_dir, device, device_config)} {} + +ov::genai::VLMPipeline::~VLMPipeline() = default; + +DecodedResults VLMPipeline::generate( + const std::string& prompt, + const std::vector& rgbs, + const GenerationConfig& generation_config, + const StreamerVariant& streamer +) { + return m_pimpl->generate(prompt, rgbs, generation_config, streamer); +} + +DecodedResults VLMPipeline::generate( + const std::string& prompt, + const ov::AnyMap& config_map +) { + return m_pimpl->generate(prompt, config_map); +} + +void VLMPipeline::start_chat(const std::string& system_message) { + m_pimpl->start_chat(system_message); +} + +void VLMPipeline::finish_chat() { + m_pimpl->finish_chat(); +} + +void VLMPipeline::set_chat_template(const std::string& new_template) { + m_pimpl->set_chat_template(new_template); +} + +GenerationConfig VLMPipeline::get_generation_config() const { + return m_pimpl->get_generation_config(); +} + +void VLMPipeline::set_generation_config(const GenerationConfig& new_config) { + m_pimpl->set_generation_config(new_config); } From 7d94e1a82b7174def025c44648f3f3651cf07e82 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Sat, 12 Oct 2024 15:10:05 +0400 Subject: [PATCH 02/28] Remove unused concatenate_mid_dim --- src/cpp/src/vlm_pipeline.cpp | 36 +++--------------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index ef7a90b717..7259cbd747 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -63,36 +63,6 @@ ov::Tensor concatenate_last_dim(const ov::Tensor& first, const ov::Tensor& secon return res; } -ov::Tensor concatenate_mid_dim(const ov::Tensor& first, const ov::Tensor& second) { - size_t res_d_0 = first.get_shape().at(0); - size_t res_d_2 = first.get_shape().at(2); - OPENVINO_ASSERT(second.get_shape().at(0) == res_d_0); - OPENVINO_ASSERT(second.get_shape().at(2) == res_d_2); - size_t res_d_1 = first.get_shape().at(1) + second.get_shape().at(1); - ov::Tensor res{first.get_element_type(), {res_d_0, res_d_1, res_d_2}}; - float* first_data = first.data(); - float* second_data = second.data(); - float* res_data = res.data(); - for (size_t i = 0; i < res_d_0; ++i) { - size_t j = 0; - for (; j < first.get_shape().at(1); ++j) { - std::copy_n( - first_data + i * first.get_shape().at(1) * res_d_2 + j * res_d_2, - res_d_2, - res_data + i * res_d_1 * res_d_2 + j * res_d_2 - ); - } - for (size_t k = 0; k < second.get_shape().at(1); ++k, ++j) { - std::copy_n( - second_data + i * second.get_shape().at(1) * res_d_2 + k * res_d_2, - res_d_2, - res_data + i * res_d_1 * res_d_2 + j * res_d_2 - ); - } - } - 
return res; -} - /// embed_dim: output dimension for each position /// pos: a list of positions to be encoded: size (H, W) /// out: (H, W, D) @@ -274,7 +244,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { bool m_is_chat_conversation; ChatHistory m_history; std::string m_templated_chat_history; - size_t image_id = 0; // Used to insert i per image (not a slice). + size_t m_image_id = 0; // Used to insert i per image (not a slice). VLMPipelineImpl( const std::filesystem::path& model_dir, @@ -521,8 +491,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { }; EncodedImage encoded_image = m_vision_encoder.encode(single_image); if (m_vlm_config.use_image_id) { - images_prompt += m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end; - ++image_id; + images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end; + ++m_image_id; } std::string unk64; for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { From eeb818d3d1131bd45e2bcb6455f248c041b1e1eb Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Sat, 12 Oct 2024 15:15:10 +0400 Subject: [PATCH 03/28] Initialize m_image_id in constructor similar to the reset of the fields --- src/cpp/src/vlm_pipeline.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index 7259cbd747..d9d7c0c508 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -244,7 +244,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { bool m_is_chat_conversation; ChatHistory m_history; std::string m_templated_chat_history; - size_t m_image_id = 0; // Used to insert i per image (not a slice). + size_t m_image_id; // Used to insert i per image (not a slice). VLMPipelineImpl( const std::filesystem::path& model_dir, @@ -258,7 +258,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { }, m_tokenizer{Tokenizer(model_dir.string(), device_config)}, m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, ov::Core{}), - m_is_chat_conversation{false} { + m_is_chat_conversation{false}, + m_image_id{0} { if (m_vlm_config.model_type == VLMModelType::MINICPM) { m_resampler = ov::Core{}.compile_model( model_dir / "resampler.xml", device, device_config From 20a6954dbc0c4adb80d81bfe08348c3401a3282f Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Sat, 12 Oct 2024 15:56:05 +0400 Subject: [PATCH 04/28] Retrigger From 0737db2eaf909a42248d23fed5c67d1a88461d67 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 14 Oct 2024 09:40:17 +0400 Subject: [PATCH 05/28] Move to visual_language --- samples/cpp/visual_language_chat/visual_language_chat.cpp | 2 +- .../openvino/genai/{ => visual_language}/vlm_pipeline.hpp | 0 src/cpp/src/utils.hpp | 2 +- src/cpp/src/vision_encoder.cpp | 2 +- src/cpp/src/vision_encoder.hpp | 4 ++-- src/cpp/src/{ => visual_language}/clip.cpp | 0 src/cpp/src/{ => visual_language}/clip.hpp | 0 src/cpp/src/{ => visual_language}/processor_config.cpp | 0 src/cpp/src/{ => visual_language}/processor_config.hpp | 0 src/cpp/src/{ => visual_language}/vlm_config.cpp | 0 src/cpp/src/{ => visual_language}/vlm_config.hpp | 2 +- .../openvino/genai => src/visual_language}/vlm_model_type.hpp | 0 src/cpp/src/{ => visual_language}/vlm_pipeline.cpp | 2 +- 13 files changed, 7 insertions(+), 7 deletions(-) rename src/cpp/include/openvino/genai/{ => visual_language}/vlm_pipeline.hpp (100%) rename src/cpp/src/{ => visual_language}/clip.cpp (100%) rename src/cpp/src/{ => visual_language}/clip.hpp (100%) rename 
src/cpp/src/{ => visual_language}/processor_config.cpp (100%) rename src/cpp/src/{ => visual_language}/processor_config.hpp (100%) rename src/cpp/src/{ => visual_language}/vlm_config.cpp (100%) rename src/cpp/src/{ => visual_language}/vlm_config.hpp (98%) rename src/cpp/{include/openvino/genai => src/visual_language}/vlm_model_type.hpp (100%) rename src/cpp/src/{ => visual_language}/vlm_pipeline.cpp (99%) diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp index b9af689fce..b36f0bec0d 100644 --- a/samples/cpp/visual_language_chat/visual_language_chat.cpp +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "load_image.hpp" -#include +#include #include bool print_subword(std::string&& subword) { diff --git a/src/cpp/include/openvino/genai/vlm_pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/vlm_pipeline.hpp similarity index 100% rename from src/cpp/include/openvino/genai/vlm_pipeline.hpp rename to src/cpp/include/openvino/genai/visual_language/vlm_pipeline.hpp diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 3ba551e169..7a0f3ddef2 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -6,7 +6,7 @@ #include #include "openvino/genai/llm_pipeline.hpp" -#include "processor_config.hpp" +#include "visual_language/processor_config.hpp" namespace ov { namespace genai { diff --git a/src/cpp/src/vision_encoder.cpp b/src/cpp/src/vision_encoder.cpp index 1153329b98..df7f43af77 100644 --- a/src/cpp/src/vision_encoder.cpp +++ b/src/cpp/src/vision_encoder.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "vision_encoder.hpp" -#include "clip.hpp" +#include "visual_language/clip.hpp" #include "utils.hpp" using namespace ov::genai; diff --git a/src/cpp/src/vision_encoder.hpp b/src/cpp/src/vision_encoder.hpp index 7cf8c88e71..446c093093 100644 --- a/src/cpp/src/vision_encoder.hpp +++ b/src/cpp/src/vision_encoder.hpp @@ -4,8 +4,8 @@ #pragma once #include -#include "processor_config.hpp" -#include "openvino/genai/vlm_model_type.hpp" +#include "visual_language/processor_config.hpp" +#include "visual_language/vlm_model_type.hpp" namespace ov::genai { /// @brief A pair describing image size. 
diff --git a/src/cpp/src/clip.cpp b/src/cpp/src/visual_language/clip.cpp similarity index 100% rename from src/cpp/src/clip.cpp rename to src/cpp/src/visual_language/clip.cpp diff --git a/src/cpp/src/clip.hpp b/src/cpp/src/visual_language/clip.hpp similarity index 100% rename from src/cpp/src/clip.hpp rename to src/cpp/src/visual_language/clip.hpp diff --git a/src/cpp/src/processor_config.cpp b/src/cpp/src/visual_language/processor_config.cpp similarity index 100% rename from src/cpp/src/processor_config.cpp rename to src/cpp/src/visual_language/processor_config.cpp diff --git a/src/cpp/src/processor_config.hpp b/src/cpp/src/visual_language/processor_config.hpp similarity index 100% rename from src/cpp/src/processor_config.hpp rename to src/cpp/src/visual_language/processor_config.hpp diff --git a/src/cpp/src/vlm_config.cpp b/src/cpp/src/visual_language/vlm_config.cpp similarity index 100% rename from src/cpp/src/vlm_config.cpp rename to src/cpp/src/visual_language/vlm_config.cpp diff --git a/src/cpp/src/vlm_config.hpp b/src/cpp/src/visual_language/vlm_config.hpp similarity index 98% rename from src/cpp/src/vlm_config.hpp rename to src/cpp/src/visual_language/vlm_config.hpp index 11f91cda2e..726e322511 100644 --- a/src/cpp/src/vlm_config.hpp +++ b/src/cpp/src/visual_language/vlm_config.hpp @@ -4,7 +4,7 @@ #pragma once #include "openvino/genai/visibility.hpp" -#include "openvino/genai/vlm_model_type.hpp" +#include "visual_language/vlm_model_type.hpp" #include #include diff --git a/src/cpp/include/openvino/genai/vlm_model_type.hpp b/src/cpp/src/visual_language/vlm_model_type.hpp similarity index 100% rename from src/cpp/include/openvino/genai/vlm_model_type.hpp rename to src/cpp/src/visual_language/vlm_model_type.hpp diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/visual_language/vlm_pipeline.cpp similarity index 99% rename from src/cpp/src/vlm_pipeline.cpp rename to src/cpp/src/visual_language/vlm_pipeline.cpp index d9d7c0c508..de5b7057ba 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/visual_language/vlm_pipeline.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "openvino/genai/vlm_pipeline.hpp" +#include "openvino/genai/visual_language/vlm_pipeline.hpp" #include "openvino/genai/tokenizer.hpp" #include "vlm_sampling.hpp" #include "clip.hpp" From 0bddfba0f36afbe44318469fa7736d28a25793f2 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 14 Oct 2024 09:44:36 +0400 Subject: [PATCH 06/28] Correct py_vlm_pipeline.cpp include --- src/python/py_vlm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index 04faed542a..765784f16f 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -7,7 +7,7 @@ #include #include #include -#include "openvino/genai/vlm_pipeline.hpp" +#include "openvino/genai/visual_image/vlm_pipeline.hpp" #include "../cpp/src/tokenizers_path.hpp" #include "./utils.hpp" From 1b2da2dd8aeb2b4df218e9ffd20d92a42d92efcd Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 14 Oct 2024 09:54:42 +0400 Subject: [PATCH 07/28] fix --- src/python/py_vlm_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index 765784f16f..fb5a8a7c68 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -7,7 +7,7 @@ #include #include #include -#include 
"openvino/genai/visual_image/vlm_pipeline.hpp" +#include "openvino/genai/visual_language/vlm_pipeline.hpp" #include "../cpp/src/tokenizers_path.hpp" #include "./utils.hpp" From 7f0ef7a17d151b61492e945476a30cfccbe2b991 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 14 Oct 2024 12:22:04 +0400 Subject: [PATCH 08/28] Move vision_encoder, pipeline.hpp --- samples/cpp/visual_language_chat/visual_language_chat.cpp | 2 +- .../genai/visual_language/{vlm_pipeline.hpp => pipeline.hpp} | 0 src/cpp/src/{ => visual_language}/vision_encoder.cpp | 0 src/cpp/src/{ => visual_language}/vision_encoder.hpp | 0 src/cpp/src/visual_language/vlm_pipeline.cpp | 2 +- src/python/py_vlm_pipeline.cpp | 2 +- 6 files changed, 3 insertions(+), 3 deletions(-) rename src/cpp/include/openvino/genai/visual_language/{vlm_pipeline.hpp => pipeline.hpp} (100%) rename src/cpp/src/{ => visual_language}/vision_encoder.cpp (100%) rename src/cpp/src/{ => visual_language}/vision_encoder.hpp (100%) diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp index b36f0bec0d..95342402cb 100644 --- a/samples/cpp/visual_language_chat/visual_language_chat.cpp +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "load_image.hpp" -#include +#include #include bool print_subword(std::string&& subword) { diff --git a/src/cpp/include/openvino/genai/visual_language/vlm_pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp similarity index 100% rename from src/cpp/include/openvino/genai/visual_language/vlm_pipeline.hpp rename to src/cpp/include/openvino/genai/visual_language/pipeline.hpp diff --git a/src/cpp/src/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp similarity index 100% rename from src/cpp/src/vision_encoder.cpp rename to src/cpp/src/visual_language/vision_encoder.cpp diff --git a/src/cpp/src/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp similarity index 100% rename from src/cpp/src/vision_encoder.hpp rename to src/cpp/src/visual_language/vision_encoder.hpp diff --git a/src/cpp/src/visual_language/vlm_pipeline.cpp b/src/cpp/src/visual_language/vlm_pipeline.cpp index de5b7057ba..868d4c586f 100644 --- a/src/cpp/src/visual_language/vlm_pipeline.cpp +++ b/src/cpp/src/visual_language/vlm_pipeline.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "openvino/genai/visual_language/vlm_pipeline.hpp" +#include "openvino/genai/visual_language/pipeline.hpp" #include "openvino/genai/tokenizer.hpp" #include "vlm_sampling.hpp" #include "clip.hpp" diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index fb5a8a7c68..5d7809ffcf 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -7,7 +7,7 @@ #include #include #include -#include "openvino/genai/visual_language/vlm_pipeline.hpp" +#include "openvino/genai/visual_language/pipeline.hpp" #include "../cpp/src/tokenizers_path.hpp" #include "./utils.hpp" From 457024c8353c801ec5bd733c0c974d5ce784830d Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 14 Oct 2024 13:00:17 +0400 Subject: [PATCH 09/28] Replace export_MiniCPM-V-2_6.py --- .github/workflows/causal_lm_cpp.yml | 6 +++--- samples/cpp/visual_language_chat/README.md | 2 +- .../cpp/visual_language_chat/export_MiniCPM-V-2_6.py | 12 ++++++------ samples/python/visual_language_chat/README.md | 2 +- 
samples/requirements.txt | 2 +- src/cpp/src/visual_language/vision_encoder.cpp | 6 +++--- src/cpp/src/visual_language/vlm_pipeline.cpp | 12 ++++++------ 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index b8fbe397d2..85a0e8b8d4 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -703,12 +703,12 @@ jobs: source ./ov/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/ + optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg - name: Run visual_language_chat sample - MiniCPM-V-2_6 run: > source ./ov/setupvars.sh - && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg + && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./MiniCPM-V-2_6/ cat.jpg <<< $'What is on the image?\nWhat is special on the image?' - name: Download and convert LLaVa 1.5 model and an image run: | @@ -728,7 +728,7 @@ jobs: source ./ov/setupvars.sh export PYTHONPATH=./build/:$PYTHONPATH printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt - timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt + timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./MiniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt cpp-continuous-batching-ubuntu: runs-on: ubuntu-20.04-8-cores diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md index b9d0ebcfe4..e487d5c1a6 100644 --- a/samples/cpp/visual_language_chat/README.md +++ b/samples/cpp/visual_language_chat/README.md @@ -10,7 +10,7 @@ It's not required to install [../../requirements.txt](../../requirements.txt) fo ```sh pip install --upgrade-strategy eager -r ../../requirements.txt -export_MiniCPM-V-2_6.py miniCPM-V-2_6 +optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code ``` ## Run diff --git a/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py b/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py index 7d2f0f1175..d466c9b683 100644 --- a/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py +++ b/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py @@ -23,10 +23,10 @@ from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher import time -text_emb_path = Path("embed_tokens.xml") -image_emb_path = Path("image_encoder.xml") -resampler_path = Path("resampler.xml") -llm_path = Path("language_model.xml") +text_emb_path = Path("openvino_text_embeddings_model.xml") +image_emb_path = Path("openvino_vision_embeddings_model.xml") +resampler_path = Path("openvino_resampler_model.xml") +llm_path = Path("openvino_language_model.xml") class InsertSlice(MatcherPass): def __init__(self): @@ -596,8 +596,8 @@ def __init__(self, model_dir, device="CPU", ov_config=None, compile=True, slice_ self.config.is_encoder_decoder = False self.generation_config 
= GenerationConfig.from_model_config(self.config) model_dir = Path(model_dir) - self.model = core.read_model(model_dir / "language_model.xml") - self.token_emb = core.read_model(model_dir / "embed_tokens.xml") + self.model = core.read_model(model_dir / "openvino_language_model.xml") + self.token_emb = core.read_model(model_dir / "openvino_text_embeddings_model.xml") if slice_lm_head: self.slice_lm_head() self.request = None diff --git a/samples/python/visual_language_chat/README.md b/samples/python/visual_language_chat/README.md index 16ef0959c5..12ffb27f99 100644 --- a/samples/python/visual_language_chat/README.md +++ b/samples/python/visual_language_chat/README.md @@ -10,8 +10,8 @@ It's not required to install [../../requirements.txt](../../requirements.txt) fo ```sh pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 miniCPM-V-2_6 --trust-remote-code ``` -# TODO: add optimum cli command for miniCPM-V-2_6 when available ## Run: [This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image. diff --git a/samples/requirements.txt b/samples/requirements.txt index df71d0cbb1..2cd9df8df4 100644 --- a/samples/requirements.txt +++ b/samples/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +optimum-intel @ git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index df7f43af77..26a8312bfb 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -300,8 +300,8 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o ov::Tensor input_tensor{ov::element::f32, {1, 3, size_t(resized_preprocessed.ny), size_t(resized_preprocessed.nx)}, (void*)(resized_preprocessed.buf.data())}; ov::Tensor pixel_values = preprocess_for_encoder(input_tensor, patch_size); encoder.set_tensor("pixel_values", pixel_values); - ov::Tensor patch_attention_mask{ov::element::boolean, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}}; - std::fill_n(patch_attention_mask.data(), patch_attention_mask.get_size(), true); + ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}}; + std::fill_n(patch_attention_mask.data(), patch_attention_mask.get_size(), 1.0f); encoder.set_tensor("patch_attention_mask", patch_attention_mask); ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {resized_source_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size); encoder.set_tensor("position_ids", position_ids); @@ -432,7 +432,7 @@ ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) : model_type(model_type) { if (model_type == VLMModelType::MINICPM) { - m_vision_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request(); + m_vision_encoder = 
core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); } else if (model_type == VLMModelType::LLAVA) { // Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); diff --git a/src/cpp/src/visual_language/vlm_pipeline.cpp b/src/cpp/src/visual_language/vlm_pipeline.cpp index 868d4c586f..575279aa42 100644 --- a/src/cpp/src/visual_language/vlm_pipeline.cpp +++ b/src/cpp/src/visual_language/vlm_pipeline.cpp @@ -262,15 +262,15 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { m_image_id{0} { if (m_vlm_config.model_type == VLMModelType::MINICPM) { m_resampler = ov::Core{}.compile_model( - model_dir / "resampler.xml", device, device_config + model_dir / "openvino_resampler_model.xml", device, device_config ).create_infer_request(); m_embedding = ov::Core{}.compile_model( - model_dir / "embed_tokens.xml", device, device_config + model_dir / "openvino_text_embeddings_model.xml", device, device_config ).create_infer_request(); m_language = ov::Core{}.compile_model( - model_dir / "language_model.xml", device, device_config + model_dir / "openvino_language_model.xml", device, device_config ).create_infer_request(); m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); @@ -613,8 +613,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { pipe.m_pos_embed_cache ); size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end()); - ov::Tensor key_padding_mask(ov::element::boolean, {bs, max_patch_len}); - bool* mask_data = key_padding_mask.data(); + ov::Tensor key_padding_mask(ov::element::f32, {bs, max_patch_len}); + float* mask_data = key_padding_mask.data(); size_t embed_len = pipe.m_pos_embed_cache.get_shape().at(2); ov::Tensor pos_embed(ov::element::f32, {max_patch_len, bs, embed_len}); // BLD => L * B * D float* pos_embed_data = pos_embed.data(); @@ -639,7 +639,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { std::fill_n(mask_data + i * max_patch_len, patch_len[i], false); std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], true); } - pipe.m_resampler.set_tensor("x", encoded_image); // [N, H*W, old_hidden_size] + pipe.m_resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size] pipe.m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] pipe.m_resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W] pipe.m_resampler.infer(); From d11f18da9adf5e3d72cdc45a5a9ed030307252c7 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 14 Oct 2024 13:08:37 +0400 Subject: [PATCH 10/28] Downgrade optimum --- .github/workflows/causal_lm_cpp.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 85a0e8b8d4..b767e5a016 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -703,6 +703,7 @@ jobs: source ./ov/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino -m 
openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg - name: Run visual_language_chat sample - MiniCPM-V-2_6 From a82fe790ca1b8f7278a99e85b989ec7aac00b167 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 14 Oct 2024 13:27:07 +0400 Subject: [PATCH 11/28] Everywhere python -m pip install -U optimum<1.23 --no-dependencies --- .github/workflows/causal_lm_cpp.yml | 17 +++++++++++++++++ .github/workflows/lcm_dreamshaper_cpp.yml | 2 ++ .github/workflows/linux.yml | 1 + .github/workflows/mac.yml | 1 + .github/workflows/stable_diffusion_1_5_cpp.yml | 2 ++ .github/workflows/windows.yml | 1 + 6 files changed, 24 insertions(+) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index b767e5a016..0921646fa7 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -45,6 +45,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 - run: > . ./ov/setupvars.sh @@ -94,6 +95,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Compare run: | @@ -230,6 +232,7 @@ jobs: call .\ov\setupvars.bat python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: > set PATH=.\build\openvino_genai\;%PATH% @@ -281,6 +284,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat - run: > . 
./ov/setupvars.sh @@ -314,6 +318,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat - run: > . ./ov/setupvars.sh @@ -348,6 +353,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 - run: > . ./ov/setupvars.sh @@ -382,6 +388,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 - run: > . ./ov/setupvars.sh @@ -416,6 +423,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b - name: run and compare @@ -459,6 +467,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat --task text-generation-with-past - name: run and compare @@ -532,6 +541,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino 
--trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 - name: Run Generation run: | @@ -586,6 +596,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat - name: Run Generation run: | @@ -640,6 +651,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Compare run: | @@ -704,6 +716,7 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -U "optimum<1.23" --no-dependencies + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg - name: Run visual_language_chat sample - MiniCPM-V-2_6 @@ -716,6 +729,7 @@ jobs: source ./ov/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/ wget https://llava-vl.github.io/static/images/monalisa.jpg - name: Run visual_language_chat sample - LLaVa 1.5 @@ -758,6 +772,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Run gtests run: | @@ -804,6 +819,7 @@ jobs: call .\ov\setupvars.bat python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + 
python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Run gtests run: | @@ -849,6 +865,7 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Run gtests run: | diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index 8fab023bd1..b00aeb2cb0 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -58,6 +58,7 @@ jobs: source openvino_lcm_cpp/bin/activate python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies - name: Download and convert models and tokenizer run: | @@ -108,6 +109,7 @@ jobs: . "./openvino_lcm_cpp/Scripts/Activate.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies - name: Download and convert models and tokenizer run: | diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index bdc5c27d34..7e19bffe52 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -435,6 +435,7 @@ jobs: source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --find-links ${OV_INSTALL_DIR}/tools python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --find-links ${OV_INSTALL_DIR}/tools + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Install samples diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index 000f35f280..963d328104 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -357,6 +357,7 @@ jobs: source ${OV_INSTALL_DIR}/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --find-links ${OV_INSTALL_DIR}/tools python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --find-links ${OV_INSTALL_DIR}/tools + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Install samples diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index 6840321d9a..72ab6f4b58 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -58,6 +58,7 @@ 
jobs: source openvino_sd_cpp/bin/activate python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies - name: Download and convert models and tokenizer run: | @@ -114,6 +115,7 @@ jobs: . "./openvino_sd_cpp/Scripts/Activate.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies - name: Download and convert models and tokenizer run: | diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 4b6692312b..88610f17fb 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -369,6 +369,7 @@ jobs: . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --find-links ${env:OV_INSTALL_DIR}/tools python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --find-links ${env:OV_INSTALL_DIR}/tools + python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Install samples From 6d37b64940c3dbfdf7f73985fcd911856141c9c6 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 14 Oct 2024 14:29:21 +0400 Subject: [PATCH 12/28] Remove duplicates --- .github/workflows/causal_lm_cpp.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 0921646fa7..07d567c44b 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -716,7 +716,6 @@ jobs: python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install -U "optimum<1.23" --no-dependencies - python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg - name: Run visual_language_chat sample - MiniCPM-V-2_6 @@ -727,9 +726,6 @@ jobs: - name: Download and convert LLaVa 1.5 model and an image run: | source ./ov/setupvars.sh - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -U "optimum<1.23" --no-dependencies optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/ wget https://llava-vl.github.io/static/images/monalisa.jpg - name: Run visual_language_chat sample - LLaVa 1.5 From b8fd628301860db5de7857a3f37ee83204760cf5 Mon Sep 17 
00:00:00 2001 From: Vladimir Zlobin Date: Mon, 14 Oct 2024 14:43:49 +0400 Subject: [PATCH 13/28] Fix dtype --- src/cpp/src/visual_language/vision_encoder.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 26a8312bfb..d7308e6534 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -333,8 +333,8 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o patch_size ); encoder.set_tensor("pixel_values", pixel_values); - ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, slices_size.height * slices_size.width}}; - std::fill_n(patch_attention_mask.data(), patch_attention_mask.get_size(), true); + ov::Tensor patch_attention_mask{ov::element::f32, {1, 1, slices_size.height * slices_size.width}}; + std::fill_n(patch_attention_mask.data(), patch_attention_mask.get_size(), 1.0f); encoder.set_tensor("patch_attention_mask", patch_attention_mask); ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {slices_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size); encoder.set_tensor("position_ids", position_ids); From 7bdce55d1bea0efb0db0185dd0561cb6ce1ccc36 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Tue, 15 Oct 2024 10:58:54 +0400 Subject: [PATCH 14/28] fix merge --- src/cpp/src/visual_language/pipeline.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index a75b5a5bb8..c1939f2766 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -350,15 +350,15 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { m_image_id{0} { if (m_vlm_config.model_type == VLMModelType::MINICPM) { m_resampler = ov::Core{}.compile_model( - model_dir / "resampler.xml", device, device_config + model_dir / "openvino_resampler_model.xml", device, device_config ).create_infer_request(); m_embedding = ov::Core{}.compile_model( - model_dir / "embed_tokens.xml", device, device_config + model_dir / "openvino_text_embeddings_model.xml", device, device_config ).create_infer_request(); m_language = ov::Core{}.compile_model( - model_dir / "language_model.xml", device, device_config + model_dir / "openvino_language_model.xml", device, device_config ).create_infer_request(); m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); @@ -703,8 +703,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { pipe.m_pos_embed_cache ); size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end()); - ov::Tensor key_padding_mask(ov::element::boolean, {bs, max_patch_len}); - bool* mask_data = key_padding_mask.data(); + ov::Tensor key_padding_mask(ov::element::f32, {bs, max_patch_len}); + float* mask_data = key_padding_mask.data(); size_t embed_len = pipe.m_pos_embed_cache.get_shape().at(2); ov::Tensor pos_embed(ov::element::f32, {max_patch_len, bs, embed_len}); // BLD => L * B * D float* pos_embed_data = pos_embed.data(); @@ -726,10 +726,10 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) { std::fill_n(pos_embed_data + flat * bs * embed_len + i * embed_len, embed_len, 0.0f); } - std::fill_n(mask_data + i * max_patch_len, patch_len[i], false); - std::fill_n(mask_data + i * max_patch_len + patch_len[i], 
max_patch_len - patch_len[i], true); + std::fill_n(mask_data + i * max_patch_len, patch_len[i], 0.0f); + std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], 1.0f); } - pipe.m_resampler.set_tensor("x", encoded_image); // [N, H*W, old_hidden_size] + pipe.m_resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size] pipe.m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] pipe.m_resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W] pipe.m_resampler.infer(); From ff4f4be934709fa97dc99381581ff9bf892c7b5a Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Tue, 15 Oct 2024 11:03:29 +0400 Subject: [PATCH 15/28] delete src/cpp/src/visual_language/vlm_pipeline.cpp --- src/cpp/src/visual_language/vlm_pipeline.cpp | 692 ------------------- 1 file changed, 692 deletions(-) delete mode 100644 src/cpp/src/visual_language/vlm_pipeline.cpp diff --git a/src/cpp/src/visual_language/vlm_pipeline.cpp b/src/cpp/src/visual_language/vlm_pipeline.cpp deleted file mode 100644 index 575279aa42..0000000000 --- a/src/cpp/src/visual_language/vlm_pipeline.cpp +++ /dev/null @@ -1,692 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "openvino/genai/visual_language/pipeline.hpp" -#include "openvino/genai/tokenizer.hpp" -#include "vlm_sampling.hpp" -#include "clip.hpp" -#include "text_callback_streamer.hpp" -#include "utils.hpp" -#include "vision_encoder.hpp" -#include "vlm_config.hpp" -#include -#include -#include - -using namespace ov::genai; - -namespace { -template struct overloaded : Ts... {using Ts::operator()...;}; -template overloaded(Ts...) -> overloaded; - -constexpr size_t BATCH_SIZE = 1; - -ov::Tensor process_prompt(ov::InferRequest& embedding, const ov::Tensor& prompt, float scale_emb) { - embedding.set_input_tensor(prompt); - embedding.infer(); - - const ov::Tensor& embed_output_tensor = embedding.get_output_tensor(); - - ov::Shape out_shape = embed_output_tensor.get_shape(); - float* data = embed_output_tensor.data(); - - //embedding * scale_emb - for (size_t idx = 0; idx < embed_output_tensor.get_size(); idx++) { - data[idx] = data[idx] * scale_emb; - } - return embed_output_tensor; -} - -ov::Tensor concatenate_last_dim(const ov::Tensor& first, const ov::Tensor& second) { - size_t res_d_0 = first.get_shape().at(0); - size_t res_d_1 = first.get_shape().at(1); - OPENVINO_ASSERT(second.get_shape().at(0) == res_d_0); - OPENVINO_ASSERT(second.get_shape().at(1) == res_d_1); - size_t res_d_2 = first.get_shape().at(2) + second.get_shape().at(2); - ov::Tensor res{first.get_element_type(), {res_d_0, res_d_1, res_d_2}}; - float* first_data = first.data(); - float* second_data = second.data(); - float* res_data = res.data(); - for (size_t i = 0; i < res_d_0; ++i) { - for (size_t j = 0; j < res_d_1; ++j) { - size_t k = 0; - for (; k < first.get_shape().at(2); ++k) { - res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] - = first_data[i * res_d_1 * first.get_shape().at(2) + j * first.get_shape().at(2) + k]; - } - for (size_t l = 0; l < second.get_shape().at(2); ++l, ++k) { - res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] - = second_data[i * res_d_1 * second.get_shape().at(2) + j * second.get_shape().at(2) + l]; - } - } - } - return res; -} - -/// embed_dim: output dimension for each position -/// pos: a list of positions to be encoded: size (H, W) -/// out: (H, W, D) -ov::Tensor get_1d_sincos_pos_embed_from_grid_new(size_t embed_dim, const ov::Tensor& pos) { - 
OPENVINO_ASSERT(embed_dim % 2 == 0); - OPENVINO_ASSERT(pos.get_shape().size() == 3); - OPENVINO_ASSERT(pos.get_shape().at(0) == 1); - size_t d0 = pos.get_shape().at(1); - size_t d1 = pos.get_shape().at(2); - size_t d2 = embed_dim / 2; - std::vector omega(d2); - for (size_t idx = 0; idx < omega.size(); ++idx) { - omega.at(idx) = idx / (embed_dim / 2.0f); - omega.at(idx) = 1.0f / std::pow(10000.0f, omega.at(idx)); // (D/2,) - } - const float* const pos_data = pos.data(); - ov::Tensor out(ov::element::f32, {d0, d1, d2}); // (H, W, D/2), outer product - float* out_data = out.data(); - for (size_t i = 0; i < d0; ++i) { - for (size_t j = 0; j < d1; ++j) { - for (size_t k = 0; k < d2; ++k) { - out_data[i * d1 * d2 + j * d2 + k] - = pos_data[i * d1 + j] * omega[k]; - } - } - } - - ov::Tensor emb_sin{out.get_element_type(), out.get_shape()}; // (H, W, D/2) - float* emb_sin_data = emb_sin.data(); - std::transform(out_data, out_data + out.get_size(), emb_sin_data, [](float arg) { - return std::sin(arg); - }); - ov::Tensor emb_cos{out.get_element_type(), out.get_shape()}; // (H, W, D/2) - float* emb_cos_data = emb_cos.data(); - std::transform(out_data, out_data + out.get_size(), emb_cos_data, [](float arg) { - return std::cos(arg); - }); - return concatenate_last_dim(emb_sin, emb_cos); // (H, W, D) -} - -ov::Tensor get_2d_sincos_pos_embed_from_grid(size_t embed_dim, const ov::Tensor& grid) { - OPENVINO_ASSERT(embed_dim % 2 == 0); - // use half of dimensions to encode grid_h - ov::Coordinate begin_h{0, 0, 0}; - ov::Coordinate end_h{grid.get_shape()}; - end_h.at(0) = 1; - ov::Coordinate begin_w{1, 0, 0}; - ov::Coordinate end_w{grid.get_shape()}; - end_w.at(0) = 2; - ov::Tensor emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{grid, begin_h, end_h}); // (H, W, D/2) - ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{grid, begin_w, end_w}); // (H, W, D/2) - return concatenate_last_dim(emb_h, emb_w); -} - -/// image_size: image_size or (image_height, image_width) -/// return: -/// pos_embed: [image_height, image_width, embed_dim] -ov::Tensor get_2d_sincos_pos_embed(size_t embed_dim, const ImageSize& image_size) { - size_t grid_h_size = image_size.height, grid_w_size = image_size.width; - ov::Tensor grid(ov::element::f32, {2, grid_h_size, grid_w_size}); - float* data = grid.data(); - for (size_t y = 0; y < grid_h_size; ++y) { - std::iota(data, data + grid_w_size, 0.0f); - data += grid_w_size; - } - for (float y = 0.0f; y < grid_h_size; ++y) { - std::fill(data, data + grid_w_size, y); - data += grid_w_size; - } - return get_2d_sincos_pos_embed_from_grid(embed_dim, grid); -} - -void adjust_pos_cache( - const std::vector& target_sizes, - size_t hidden_size, - ov::Tensor& pos_embed_cache -) { - size_t max_h = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { - return left.height < right.height; - })->height; - size_t max_w = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { - return left.width < right.width; - })->width; - size_t allocated_height, allocated_width; - if (pos_embed_cache) { - const ov::Shape& allocated_shape = pos_embed_cache.get_shape(); - allocated_height = allocated_shape.at(0); - allocated_width = allocated_shape.at(1); - } else { - allocated_height = allocated_width = 70; - } - if (max_h > allocated_height || max_w > allocated_width) { - allocated_height = std::max(max_h, allocated_height); - allocated_width = 
std::max(max_w, allocated_width); - pos_embed_cache = get_2d_sincos_pos_embed( - hidden_size, {allocated_height, allocated_width} - ); - } -} - -ov::Tensor merge_text_and_image_embeddings_llava( - const ov::Tensor& input_ids, - const ov::Tensor& text_embeds, - const ov::Tensor& image_embeds, - int64_t image_token_index -) { - auto text_embeds_shape = text_embeds.get_shape(); - auto image_embeds_shape = image_embeds.get_shape(); - - OPENVINO_ASSERT( - text_embeds_shape[2] == image_embeds_shape[2], - "Incompatible shapes between text_embeds and image_embeds" - ); - - size_t text_embeds_seq_length = text_embeds_shape[1]; - size_t hidden_size = text_embeds_shape[2]; - size_t image_embeds_seq_length = image_embeds_shape[1]; - - size_t merged_seq_length = text_embeds_seq_length + (image_embeds_seq_length - 1); - - ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); - - const int64_t* input_ids_data = input_ids.data(); - const float* text_embeds_data = text_embeds.data(); - const float* image_embeds_data = image_embeds.data(); - float* merged_data = merged_embeds.data(); - - - size_t merged_idx = 0; - for (size_t s = 0; s < text_embeds_seq_length; ++s) { - if (input_ids_data[s] == image_token_index) { - for (size_t i = 0; i < image_embeds_seq_length; ++i) { - std::copy_n(image_embeds_data + i * hidden_size, - hidden_size, - merged_data + merged_idx * hidden_size); - merged_idx++; - } - } else { - std::copy_n(text_embeds_data + s * hidden_size, - hidden_size, - merged_data + merged_idx * hidden_size); - merged_idx++; - } - } - - return merged_embeds; -} -} - -class ov::genai::VLMPipeline::VLMPipelineImpl { -public: - // A config to follow for LLM input construction. - VLMConfig m_vlm_config; - // A config to follow for text generation. - GenerationConfig m_generation_config; - // A tokenizer encoding a prompt. - Tokenizer m_tokenizer; - // An encoder to infer embeddings of an image. - VisionEncoder m_vision_encoder; - // A resampler model to resample image embeddings. - // [N, H*W, old_hidden_size] is the input shape. - // [N, query_num, hidden_size] is the output shape. - ov::InferRequest m_resampler; - // A model to compute token embeddings. - // Input shape: [N, conversation length]. - // Output shape: [1, conversation length, hidden_size]. - ov::InferRequest m_embedding; - // A language model used to generate a response. - // Input shapes: inputs_embeds[N, conversation length, hidden_size], - // position_ids[N, conversation length], beam_idx[N]. - // Output shape: logits[N, conversation length, vocab_size]. - ov::InferRequest m_language; - // Precomputed positional embeddings for the resampler. - // [70, 70, hidden_size]. 70 is the initial guess of the image - // height and width after dividing by patch_size. - ov::Tensor m_pos_embed_cache; - // True if chat mode is activated to save conversation - // history between generate() calls. - bool m_is_chat_conversation; - ChatHistory m_history; - std::string m_templated_chat_history; - size_t m_image_id; // Used to insert i per image (not a slice). 
- - VLMPipelineImpl( - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config - ) : - m_vlm_config{ - utils::from_config_json_if_exists( - model_dir, "config.json" - ) - }, - m_tokenizer{Tokenizer(model_dir.string(), device_config)}, - m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, ov::Core{}), - m_is_chat_conversation{false}, - m_image_id{0} { - if (m_vlm_config.model_type == VLMModelType::MINICPM) { - m_resampler = ov::Core{}.compile_model( - model_dir / "openvino_resampler_model.xml", device, device_config - ).create_infer_request(); - - m_embedding = ov::Core{}.compile_model( - model_dir / "openvino_text_embeddings_model.xml", device, device_config - ).create_infer_request(); - - m_language = ov::Core{}.compile_model( - model_dir / "openvino_language_model.xml", device, device_config - ).create_infer_request(); - - m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); - } else if (m_vlm_config.model_type == VLMModelType::LLAVA) { - m_language = ov::Core{}.compile_model( - model_dir / "openvino_language_model.xml", device, device_config - ).create_infer_request(); - - // Reusing the same m_embedding for llava text_embeddings model - m_embedding = ov::Core{}.compile_model( - model_dir / "openvino_text_embeddings_model.xml", device, device_config - ).create_infer_request(); - } - - m_language.get_tensor("attention_mask").set_shape({1, 0}); - } - - DecodedResults generate( - const std::string& prompt, - const std::vector& rgbs, - const GenerationConfig& generation_config, - const StreamerVariant& streamer - ) { - ov::Tensor inputs_embeds; - if (m_vlm_config.model_type == VLMModelType::MINICPM) { - inputs_embeds = get_inputs_embeds_minicpm(prompt, rgbs); - } else if (m_vlm_config.model_type == VLMModelType::LLAVA) { - inputs_embeds = get_inputs_embeds_llava(prompt, rgbs); - } - - m_language.set_tensor("inputs_embeds", inputs_embeds); - size_t history_len = m_language.get_tensor("attention_mask").get_shape().at(1); - m_language.get_tensor("attention_mask").set_shape({1, history_len + inputs_embeds.get_shape()[1]}); - std::fill_n(m_language.get_tensor("attention_mask").data(), m_language.get_tensor("attention_mask").get_size(), 1); - - m_language.get_tensor("position_ids").set_shape({1, inputs_embeds.get_shape().at(1)}); - std::iota(m_language.get_tensor("position_ids").data(), m_language.get_tensor("position_ids").data() + m_language.get_tensor("position_ids").get_size(), history_len); - - m_language.get_tensor("beam_idx").set_shape({ BATCH_SIZE }); - m_language.get_tensor("beam_idx").data()[0] = 0; - - m_language.infer(); - - ov::Shape logits_shape = m_language.get_tensor("logits").get_shape(); - auto attention_size = m_language.get_tensor("attention_mask").get_size(); - - int64_t sequence_len = m_language.get_tensor("logits").get_shape().at(1) - 1; - size_t vocab_size = m_language.get_tensor("logits").get_shape().back(); - float* logits = m_language.get_tensor("logits").data() + sequence_len * vocab_size; - int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; - - m_language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size}); - m_language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 }); - - m_embedding.get_input_tensor().set_shape({ 1, 1 }); - - int64_t eos_token_id = m_tokenizer.get_eos_token_id(); - std::shared_ptr streamer_ptr = std::visit(overloaded{ - [&m_tokenizer = m_tokenizer]( - const std::function& callback - ) -> 
std::shared_ptr { - return std::make_shared(m_tokenizer, callback); - }, - [](const std::shared_ptr& ptr) { - return ptr; - }, - [](std::monostate) { - return std::shared_ptr{nullptr}; - }, - }, streamer); - std::vector generated; - while (true) { //(out_token != eos_token_id) - m_embedding.get_input_tensor().data()[0] = out_token; - m_embedding.infer(); - const ov::Tensor& embed_prompt_tensor = m_embedding.get_output_tensor(); - float* embed_data = embed_prompt_tensor.data(); - for (auto idx = 0; idx < embed_prompt_tensor.get_size(); idx++) { - embed_data[idx] = embed_data[idx] * m_vlm_config.scale_emb; - } - - m_language.set_tensor("inputs_embeds", embed_prompt_tensor); - m_language.get_tensor("attention_mask").set_shape({ BATCH_SIZE, m_language.get_tensor("attention_mask").get_shape()[1] + 1 }); - std::fill_n(m_language.get_tensor("attention_mask").data(), m_language.get_tensor("attention_mask").get_size(), 1); - m_language.get_tensor("position_ids").data()[0] = int64_t(m_language.get_tensor("attention_mask").get_size() - 2); - - m_language.infer(); - - generated.push_back(out_token); - if (streamer_ptr && streamer_ptr->put(out_token)) { - break; - } - logits = m_language.get_tensor("logits").data(); - - out_token = std::max_element(logits, logits + vocab_size) - logits; - if (out_token == eos_token_id) { - break; - } - } - - if (streamer_ptr) { - streamer_ptr->end(); - } - - std::string decoded_results = m_tokenizer.decode(generated); - if (m_is_chat_conversation) { - // Tail of chat template is missing in KV cache. - // Find the tail to concatenate it with the next input prompt. - m_templated_chat_history.append(decoded_results); - m_history.push_back({{"role", "assistant"}, {"content", decoded_results}}); - } else { - for (auto& variable : m_language.query_state()) { - variable.reset(); - } - m_language.get_tensor("attention_mask").set_shape({1, 0}); - } - return {{std::move(decoded_results)}}; - } - - DecodedResults generate( - const std::string& prompt, - const ov::AnyMap& config_map - ) { - auto image = config_map.find(ov::genai::image.name()); - auto images = config_map.find(ov::genai::images.name()); - OPENVINO_ASSERT( - config_map.end() == image || config_map.end() == images, - "Only one property can be set: image of images." - ); - std::vector rgbs; - if (config_map.end() != image) { - rgbs = {image->second.as()}; - } if (config_map.end() != images) { - rgbs = images->second.as>(); - } - ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map); - GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); - config.update_generation_config(config_map); - return generate( - prompt, - rgbs, - config, - utils::get_streamer_from_map(config_map) - ); - } - - void start_chat(const std::string& system_message) { - m_is_chat_conversation = true; - bool have_state = 0 != m_language.get_tensor("attention_mask").get_size(); - if (have_state) { - // Resetting state may be slow. - for (ov::VariableState& variable : m_language.query_state()) { - variable.reset(); - } - // Since if is already introduced, move all resetting here. 
- m_language.get_tensor("attention_mask").set_shape({1, 0}); - m_history.clear(); - m_templated_chat_history.clear(); - } - if (system_message.empty()) { - return; - } - m_history = {{{"role", "system"}, {"content", system_message}}}; - constexpr bool add_generation_prompt = false; - m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); - } - - void finish_chat() {m_is_chat_conversation = false;} - - void set_chat_template(const std::string& new_template) { - m_tokenizer.set_chat_template(new_template); - } - - GenerationConfig get_generation_config() const { - return m_generation_config; - } - - void set_generation_config(const GenerationConfig& new_config) { - m_generation_config = new_config; - } - - ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector& images) { - std::string image_token = ""; // TODO Consider getting from vlm_config or json - std::string formatted_prompt = "USER: " + (images.empty() ? prompt : image_token + "\n" + prompt) + " ASSISTANT:"; - ov::Tensor input_ids = m_tokenizer.encode(formatted_prompt).input_ids; - if (images.empty()) { - return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); - } else { - OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed"); - EncodedImage encoded_image = m_vision_encoder.encode(images.at(0)); - ov::Tensor image_embeds = encoded_image.resized_source; - - ov::Tensor text_embeds = process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); - - int64_t image_token_index = 32000; // TODO Consider getting from m_vlm_config.image_token_index or config.json - - return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_index); - } - } - - ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector& images) { - std::string images_prompt; - std::vector embeds; - for (const ov::Tensor& rgb : images) { - ov::Tensor reshaped = rgb; - ov::Shape rgb_shape = rgb.get_shape(); - switch (rgb_shape.size()) { - case 3: - reshaped.set_shape({1, rgb_shape.at(0), rgb_shape.at(1), rgb_shape.at(2)}); - break; - case 4: break; - default: OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); - } - ov::Shape reshaped_shape = reshaped.get_shape(); - for (size_t batch_idx = 0; batch_idx < reshaped_shape.at(0); ++batch_idx) { - ov::Tensor single_image{ - ov::element::u8, - {1, reshaped_shape.at(1), reshaped_shape.at(2), reshaped_shape.at(3)}, - reshaped.data() + batch_idx * reshaped_shape.at(1) * reshaped_shape.at(1) * reshaped_shape.at(1) - }; - EncodedImage encoded_image = m_vision_encoder.encode(single_image); - if (m_vlm_config.use_image_id) { - images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end; - ++m_image_id; - } - std::string unk64; - for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { - unk64 += m_vlm_config.unk; - } - images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; - if (encoded_image.slices) { - ov::Shape slices_shape = encoded_image.slices.get_shape(); - for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { - for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { - images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; - } - images_prompt += '\n'; - } - } - if ('\n' != *(images_prompt.end() - 1)) { - // Image wasn't sliced, add \n to the end of image anyway. - // Strangely, \n isn't placed between . 
- images_prompt += '\n';
- }
- embeds.push_back(std::move(encoded_image));
- }
- }
- images_prompt += prompt;
- ov::Tensor encoded_input;
- if (m_is_chat_conversation) {
- // KV cache in model already contains prompts and answers from previous iterations.
- // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns
- // token_ids = {<bos token>, ...}. So if tokenizer applies only to the new prompt,
- // <bos token> will be inserted on every iteration.
- // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt
- // and takes only the difference between them.
- // The chat history cannot be saved as already encoded tokens because generate call doesn't return <eos> token, but
- // KV cache contains it. So we have to add it manually or get it by tokenization all chat history.
- m_history.push_back({{"role", "user"}, {"content", images_prompt}});
- constexpr bool add_generation_prompt = true;
- std::string new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
- ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids;
- if (0 == m_language.get_tensor("attention_mask").get_shape().at(1)) {
- encoded_input = new_chat_tokens;
- } else {
- TokenizedInputs prev_chat_tokens = m_tokenizer.encode(
- m_templated_chat_history
- );
- encoded_input = utils::subtract_chat_tokenized_inputs(
- {new_chat_tokens}, prev_chat_tokens
- ).input_ids;
- }
- m_templated_chat_history = std::move(new_templated_chat_history);
- } else {
- encoded_input = m_tokenizer.encode(images_prompt).input_ids;
- }
- m_embedding.set_input_tensor(encoded_input);
- m_embedding.infer();
- ov::Tensor inputs_embeds = m_embedding.get_output_tensor();
- OPENVINO_ASSERT(
- m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2),
- "Unexpected embedding size"
- );
- ov::Tensor special_tokens = m_tokenizer.encode(
- m_vlm_config.im_start
- + m_vlm_config.im_end
- + m_vlm_config.slice_start
- + m_vlm_config.slice_end
- ).input_ids;
- OPENVINO_ASSERT(
- 4 == special_tokens.get_shape().at(1),
- "Every special token must be represented with a single int."
- );
- int64_t im_start_id = special_tokens.data<int64_t>()[0];
- int64_t im_end_id = special_tokens.data<int64_t>()[1];
- int64_t slice_start_id = special_tokens.data<int64_t>()[2];
- int64_t slice_end_id = special_tokens.data<int64_t>()[3];
- int64_t im_start_pos = 0, slice_start_pos = 0;
- int64_t* begin = encoded_input.data<int64_t>();
- int64_t* ids = begin;
- size_t encoded_input_size = encoded_input.get_size();
- int64_t* end = ids + encoded_input_size;
- float* inputs_embeds_data = inputs_embeds.data<float>();
- for (const EncodedImage& encoded_image : embeds) {
- const ov::Tensor& resampled_source = resample(*this, encoded_image.resized_source, {encoded_image.resized_source_size});
- float* emb = resampled_source.data<float>();
- ids = std::find(ids, end, im_start_id);
- OPENVINO_ASSERT(end != ids);
- std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size);
- ids += m_vlm_config.query_num;
- if (encoded_image.slices) {
- size_t token_idx = 0;
- const ov::Shape& slices_shape = encoded_image.slices.get_shape();
- for (size_t i = 0; i < slices_shape.at(0); ++i) {
- for (size_t ja = 0; ja < slices_shape.at(1); ++ja) {
- size_t d2 = slices_shape.at(2);
- size_t d3 = slices_shape.at(3);
- ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data<float>() + (i * slices_shape.at(1) + ja) * d2 * d3};
- const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {encoded_image.slices_size});
- ids = std::find(ids, end, slice_start_id);
- OPENVINO_ASSERT(end != ids);
- std::copy_n(vision_embed_tensor_i_j.data<float>(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size);
- ids += m_vlm_config.query_num;
- }
- }
- }
- }
-
- return inputs_embeds;
- }
-
- ov::Tensor resample(VLMPipeline::VLMPipelineImpl& pipe, const ov::Tensor& encoded_image, const std::vector<ImageSize>& target_sizes) {
- size_t bs = encoded_image.get_shape().at(0);
- std::vector<size_t> patch_len{target_sizes.size()};
- std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const ImageSize& height_width) {
- return height_width.height * height_width.width;
- });
- adjust_pos_cache(
- target_sizes,
- pipe.m_vlm_config.hidden_size,
- pipe.m_pos_embed_cache
- );
- size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end());
- ov::Tensor key_padding_mask(ov::element::f32, {bs, max_patch_len});
- float* mask_data = key_padding_mask.data<float>();
- size_t embed_len = pipe.m_pos_embed_cache.get_shape().at(2);
- ov::Tensor pos_embed(ov::element::f32, {max_patch_len, bs, embed_len}); // BLD => L * B * D
- float* pos_embed_data = pos_embed.data<float>();
- float* cache_data = pipe.m_pos_embed_cache.data<float>();
- size_t _d0 = pipe.m_pos_embed_cache.get_shape().at(0);
- size_t _d1 = pipe.m_pos_embed_cache.get_shape().at(1);
- for (size_t i = 0; i < bs; ++i) {
- size_t target_h = target_sizes.at(i).height;
- size_t target_w = target_sizes.at(i).width;
- for (size_t h_idx = 0; h_idx < target_h; ++h_idx) {
- for (size_t w_idx = 0; w_idx < target_w; ++w_idx) {
- std::copy_n(
- cache_data + h_idx * _d1 + w_idx,
- embed_len,
- pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * embed_len
- );
- }
- }
- for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) {
- std::fill_n(pos_embed_data + flat * bs * embed_len + i * embed_len, embed_len, 0.0f);
- }
- std::fill_n(mask_data + i * max_patch_len, patch_len[i], false);
- std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], true);
- }
-
- pipe.m_resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size]
- pipe.m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size]
- pipe.m_resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W]
- pipe.m_resampler.infer();
- return pipe.m_resampler.get_output_tensor(); // [N, query_num, new_hidden_size]
- }
-};
-
-VLMPipeline::VLMPipeline(
- const std::filesystem::path& model_dir,
- const std::string& device,
- const ov::AnyMap device_config
-) : m_pimpl{std::make_unique<VLMPipelineImpl>(model_dir, device, device_config)} {}
-
-ov::genai::VLMPipeline::~VLMPipeline() = default;
-
-DecodedResults VLMPipeline::generate(
- const std::string& prompt,
- const std::vector<ov::Tensor>& rgbs,
- const GenerationConfig& generation_config,
- const StreamerVariant& streamer
-) {
- return m_pimpl->generate(prompt, rgbs, generation_config, streamer);
-}
-
-DecodedResults VLMPipeline::generate(
- const std::string& prompt,
- const ov::AnyMap& config_map
-) {
- return m_pimpl->generate(prompt, config_map);
-}
-
-void VLMPipeline::start_chat(const std::string& system_message) {
- m_pimpl->start_chat(system_message);
-}
-
-void VLMPipeline::finish_chat() {
- m_pimpl->finish_chat();
-}
-
-void VLMPipeline::set_chat_template(const std::string& new_template) {
- m_pimpl->set_chat_template(new_template);
-}
-
-GenerationConfig VLMPipeline::get_generation_config() const {
- return m_pimpl->get_generation_config();
-}
-
-void VLMPipeline::set_generation_config(const GenerationConfig& new_config) {
- m_pimpl->set_generation_config(new_config);
-}

From 4112edfa833d09dd3e08d2d78c5f7bd533546c4a Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Tue, 15 Oct 2024 14:17:07 +0400
Subject: [PATCH 16/28] fix conversion in test

---
 tests/python_tests/test_vlm_api.py | 107 ++++++++++------------------
 1 file changed, 38 insertions(+), 69 deletions(-)

diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_vlm_api.py
index b32b2b5fb6..d0a788b9b3 100644
--- a/tests/python_tests/test_vlm_api.py
+++ b/tests/python_tests/test_vlm_api.py
@@ -1,69 +1,33 @@
 # Copyright (C) 2018-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-import openvino_genai
+import openvino_tokenizers
+import openvino
 import pytest
-import gc
-import os
-import numpy as np
-from PIL import Image
-from multiprocessing import Process
-
+import transformers
+from optimum.intel.openvino import OVModelForVisualCausalLM
 from openvino_genai import VLMPipeline
-from openvino import Tensor
 from common import get_greedy, get_image_by_link, get_beam_search, get_greedy, get_multinomial_all_parameters
 def get_ov_model(model_dir):
- import sys
- from pathlib import Path
- #TODO: use optimum-intel
-
- sys.path.append(str(Path(__file__).resolve().parents[2] / 'samples/cpp/visual_language_chat'))
- import importlib
- export_MiniCPM = importlib.import_module("export_MiniCPM-V-2_6", "export_MiniCPM")
- convert_llm = getattr(export_MiniCPM, "convert_llm")
- convert_vision_encoder = getattr(export_MiniCPM, "convert_vision_encoder")
- from transformers import AutoModel, AutoTokenizer, AutoProcessor
- import os
- import openvino_tokenizers
- import openvino as ov
- import gc
-
+ if (model_dir / "openvino_language_model.xml").exists():
+ return model_dir
 model_id = "openbmb/MiniCPM-V-2_6"
- ckpt = Path(os.path.join(model_dir, "ckpt"))
- if not ckpt.exists():
- snapshot_download = getattr(export_MiniCPM, "snapshot_download")
- patch_model_code = getattr(export_MiniCPM, "patch_model_code")
- snapshot_download(model_id,
local_dir=ckpt, force_download=True) - patch_model_code(ckpt) - model = AutoModel.from_pretrained(ckpt, trust_remote_code=True) - model.eval() + processor = transformers.AutoProcessor.from_pretrained(model_id, trust_remote_code=True) + processor.tokenizer.save_pretrained(model_dir) + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True) + openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml") + openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml") + model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True) model.config.save_pretrained(model_dir) - tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True) - tokenizer.save_pretrained(model_dir) - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, with_detokenizer=True) - ov.save_model(ov_tokenizer, os.path.join(model_dir, "openvino_tokenizer.xml")) - ov.save_model(ov_detokenizer, os.path.join(model_dir, "openvino_detokenizer.xml")) - processor = AutoProcessor.from_pretrained(ckpt, trust_remote_code=True) - processor.save_pretrained(model_dir) - - convert_llm(model, model_dir) - del model.llm - gc.collect() - - convert_vision_encoder(model, model_dir) + model.generation_config.save_pretrained(model_dir) + model.save_pretrained(model_dir) return model_dir -sampling_configs = [ - get_beam_search(), - get_greedy(), - get_multinomial_all_parameters() -] prompts = [ "What is on the image?", "What is special about this image?", - "Tell me more about this image." ] image_links = [ @@ -75,37 +39,42 @@ def get_ov_model(model_dir): image_links_for_testing = [ [], [image_links[0]], - [image_links[1], image_links[0]], [image_links[0], image_links[2], image_links[1]] ] @pytest.mark.precommit -def test_vlm_pipeline(tmp_path): - import os - +@pytest.mark.nightly +def test_vlm_pipeline(cache): def streamer(word: str) -> bool: print(word, end="") return False - model_path = get_ov_model(os.path.join(tmp_path, "miniCPM")) + model_path = get_ov_model(cache.mkdir("MiniCPM-V-2_6")) - for generation_config in sampling_configs: - for links in image_links_for_testing: - images = [] - for link in links: - images.append(get_image_by_link(link)) + for links in image_links_for_testing: + images = [] + for link in links: + images.append(get_image_by_link(link)) - pipe = VLMPipeline(model_path, "CPU") - pipe.start_chat() + pipe = VLMPipeline(str(model_path), "CPU") + pipe.start_chat() - pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer) + pipe.generate(prompts[0], images=images, generation_config=get_greedy(), streamer=streamer) - for prompt in prompts[1:]: - pipe.generate(prompt, generation_config=generation_config, streamer=streamer) + for prompt in prompts[1:]: + pipe.generate(prompt, generation_config=get_greedy(), streamer=streamer) - pipe.finish_chat() - gc.collect() - del pipe - gc.collect() + pipe.finish_chat() +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("config", [ + get_beam_search(), + get_multinomial_all_parameters(), +]) +def test_sampling(config, cache): + model_path = get_ov_model(cache.mkdir("MiniCPM-V-2_6")) + image = get_image_by_link(image_links[0]) + pipe = VLMPipeline(str(model_path), "CPU") + pipe.generate(prompts[0], image=image, generation_config=config) From c4573b86d6890d555bc4219cb04ffaac1b296cb0 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: 
Tue, 15 Oct 2024 14:21:27 +0400 Subject: [PATCH 17/28] dont print in test --- tests/python_tests/test_vlm_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_vlm_api.py index d0a788b9b3..38c79a2f3d 100644 --- a/tests/python_tests/test_vlm_api.py +++ b/tests/python_tests/test_vlm_api.py @@ -46,7 +46,6 @@ def get_ov_model(model_dir): @pytest.mark.nightly def test_vlm_pipeline(cache): def streamer(word: str) -> bool: - print(word, end="") return False model_path = get_ov_model(cache.mkdir("MiniCPM-V-2_6")) From 8c67805ca27ab12649786beb9f83861525b0b81b Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Tue, 15 Oct 2024 14:23:37 +0400 Subject: [PATCH 18/28] skip --- tests/python_tests/test_vlm_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_vlm_api.py index 38c79a2f3d..94ad8c76be 100644 --- a/tests/python_tests/test_vlm_api.py +++ b/tests/python_tests/test_vlm_api.py @@ -72,6 +72,7 @@ def streamer(word: str) -> bool: get_beam_search(), get_multinomial_all_parameters(), ]) +@pytest.mark.sip("Enable after sampler are enabled") def test_sampling(config, cache): model_path = get_ov_model(cache.mkdir("MiniCPM-V-2_6")) image = get_image_by_link(image_links[0]) From 24015daac9b3352931c86efce14a7e0b62469236 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Tue, 15 Oct 2024 14:28:56 +0400 Subject: [PATCH 19/28] cleanup --- samples/cpp/visual_language_chat/README.md | 2 +- .../export_MiniCPM-V-2_6.py | 1199 ----------------- samples/python/visual_language_chat/README.md | 2 +- samples/requirements.txt | 1 - 4 files changed, 2 insertions(+), 1202 deletions(-) delete mode 100644 samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md index e487d5c1a6..99ba417baf 100644 --- a/samples/cpp/visual_language_chat/README.md +++ b/samples/cpp/visual_language_chat/README.md @@ -10,7 +10,7 @@ It's not required to install [../../requirements.txt](../../requirements.txt) fo ```sh pip install --upgrade-strategy eager -r ../../requirements.txt -optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code +optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code MiniCPM-V-2_6 ``` ## Run diff --git a/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py b/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py deleted file mode 100644 index 94472bcd77..0000000000 --- a/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py +++ /dev/null @@ -1,1199 +0,0 @@ -import argparse -import requests -import torch -from threading import Thread -from copy import deepcopy -import shutil -import json -from PIL import Image -from transformers import AutoModel, AutoTokenizer, AutoProcessor, TextIteratorStreamer -from transformers.generation import GenerationMixin -from transformers import AutoConfig, GenerationConfig -from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPooling -from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask -from pathlib import Path -from huggingface_hub import snapshot_download -import types -from typing import Optional, Tuple, List, Union -from openvino.runtime import opset13 -import openvino as ov -import openvino_tokenizers -import numpy as np -import gc -from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher -import time - -text_emb_path = 
Path("openvino_text_embeddings_model.xml") -image_emb_path = Path("openvino_vision_embeddings_model.xml") -resampler_path = Path("openvino_resampler_model.xml") -llm_path = Path("openvino_language_model.xml") - -class InsertSlice(MatcherPass): - def __init__(self): - MatcherPass.__init__(self) - self.model_changed = False - - param = WrapType("opset10.Result") - - def callback(matcher: Matcher) -> bool: - root = matcher.get_match_root() - if root is None: - return False - if len(root.get_output_partial_shape(0)) == 3: - parent = root.input_value(0).get_node() - grand_parent = parent.input_value(0).get_node() - - grand_parent_output = parent.input(0).get_source_output() - consumers = grand_parent_output.get_target_inputs() - start = np.array([0, -1, 0], dtype=np.int32) - stop = np.array([1, -2, grand_parent_output.get_partial_shape()[-1].get_length()], dtype=np.int32) - step = np.array([1, -1, 1], dtype=np.int32) - axes = np.array([0, 1, 2], dtype=np.int32) - slice = opset13.slice(grand_parent, start, stop, step, axes, name="inserted_slice") - for consumer in consumers: - consumer.replace_source_output(slice.output(0)) - self.model_changed = True - # Use new operation for additional matching - self.register_new_node(slice) - print("applied slice for lm head") - - return True - - self.register_matcher(Matcher(param, "InsertSlice"), callback) - - -def model_has_state(ov_model: ov.Model): - return len(ov_model.get_sinks()) > 0 - - -def model_has_input_output_name(ov_model: ov.Model, name: str): - """ - Helper function for checking that model has specified input or output name - - Parameters: - ov_model (ov.Model): - name (str): - name of input or output - - Returns: - True if input or output with requested name exists else False - """ - return name in sum([list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], []) - - -def fuse_cache_reorder( - ov_model: ov.Model, - not_kv_inputs: List[str], - key_value_input_names: List[str], - gather_dim: int, -): - """ - Fuses reored_cache during generate cycle into ov.Model. Used with stateful models, because we can not modify model state directly. - - Adds a new beam_idx parameter and Gather op per each kv-cache input in a given model. - Should be run before make_stateful. Implements optimumum's _reorder_cache - inside the model in the beginning of each iteration. - Gather works along given gather_dim dimension that may vary from model to model. - KV-cache inputs are identified based on names in key_value_input_names. - Append the new beam_idx parameter to not_kv_inputs. 
- - Parameters: - ov_model (`ov.Model`): - openvino model for processing - not_kv_inputs (`List[str]`): - list of input nodes in model that not related to past key values - key_value_input_names (`List[str]`): - list of names for key value input layers - gather_dim (int): - dimension for gathering cache during reorder pass - """ - - if model_has_input_output_name(ov_model, "beam_idx"): - raise ValueError("Model already has fused cache") - input_batch = ov_model.input("inputs_embeds").get_partial_shape()[0] - beam_idx = opset13.parameter(name="beam_idx", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch])) - beam_idx.output(0).get_tensor().add_names({"beam_idx"}) - ov_model.add_parameters([beam_idx]) - not_kv_inputs.append(ov_model.inputs[-1]) - # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx - for input_name in key_value_input_names: - parameter_output_port = ov_model.input(input_name) - consumers = parameter_output_port.get_target_inputs() - gather = opset13.gather(parameter_output_port, beam_idx, opset13.constant(gather_dim)) - for consumer in consumers: - consumer.replace_source_output(gather.output(0)) - ov_model.validate_nodes_and_infer_types() - - -def build_state_initializer(ov_model: ov.Model, batch_dim: int): - """ - Build initialization ShapeOf Expression for all ReadValue ops - - Parameters: - ov_model (ov.Model): - openvino model - batch_dim (int): - index of dimension corresponding to batch size - """ - input_ids = ov_model.input("inputs_embeds") - batch = opset13.gather( - opset13.shape_of(input_ids, output_type="i64"), - opset13.constant([0]), - opset13.constant(0), - ) - for op in ov_model.get_ops(): - if op.get_type_name() == "ReadValue": - dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))] - dims[batch_dim] = batch - dims = [(opset13.constant(np.array([dim], dtype=np.int64)) if isinstance(dim, int) else dim) for dim in dims] - shape = opset13.concat(dims, axis=0) - broadcast = opset13.broadcast(opset13.constant(0.0, dtype=op.get_output_element_type(0)), shape) - op.set_arguments([broadcast]) - ov_model.validate_nodes_and_infer_types() - - -def make_stateful( - ov_model: ov.Model, - not_kv_inputs: List[str], - key_value_input_names: List[str], - key_value_output_names: List[str], - batch_dim: int, - num_attention_heads: int, - num_beams_and_batch: int = None, -): - """ - Hides kv-cache inputs and outputs inside the model as variables. 
- - Parameters: - ov_model (ov.Model): - openvino model - not_kv_inputs (`List[str]`): - list of input nodes in model that not related to past key values - key_value_input_names (`List[str]`): - list of names for key value input layers - key_value_output_names (`List[str]`): - list of names for key value input layers - batch_dim (int): - index of batch dimension in key value layers - num_attention_heads (int): - number of attention heads for batch dimension initialization - num_beams_an_batch (int): - precalculated number of beams and batch for shapes initialization - """ - from openvino._offline_transformations import apply_make_stateful_transformation - - input_output_map = {} - - if num_beams_and_batch is not None: - # Set batch size for input_ids and attention mask to avoid dynamic dimension got propagated from the end of the model back to ReadValue - for input in not_kv_inputs: - shape = input.get_partial_shape() - if shape.rank.get_length() <= 2: # == 1 for beam_index - shape[0] = num_beams_and_batch - input.get_node().set_partial_shape(shape) - for kv_name_pair in zip(key_value_input_names, key_value_output_names): - input_output_map[kv_name_pair[0]] = kv_name_pair[1] - if num_beams_and_batch is not None: - input = ov_model.input(kv_name_pair[0]) - shape = input.get_partial_shape() - shape[batch_dim] = num_beams_and_batch * num_attention_heads - input.get_node().set_partial_shape(shape) - - if num_beams_and_batch is not None: - # Re-validation model if shapes are altered above - ov_model.validate_nodes_and_infer_types() - - apply_make_stateful_transformation(ov_model, input_output_map) - if num_beams_and_batch is None: - build_state_initializer(ov_model, batch_dim) - - -def patch_stateful(ov_model): - key_value_input_names = [key.get_any_name() for key in ov_model.inputs[2:-1]] - key_value_output_names = [key.get_any_name() for key in ov_model.outputs[1:]] - not_kv_inputs = [input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names())] - if not key_value_input_names or not key_value_output_names: - return - batch_dim = 0 - num_attention_heads = 1 - - fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim) - make_stateful( - ov_model, - not_kv_inputs, - key_value_input_names, - key_value_output_names, - batch_dim, - num_attention_heads, - None, - ) - - -def cleanup_torchscript_cache(): - """ - Helper for removing cached model representation - """ - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - -def get_2d_sincos_pos_embed(embed_dim, image_size): - """ - image_size: image_size or (image_height, image_width) - return: - pos_embed: [image_height, image_width, embed_dim] - """ - if isinstance(image_size, int): - grid_h_size, grid_w_size = image_size, image_size - else: - grid_h_size, grid_w_size = image_size[0], image_size[1] - - grid_h = np.arange(grid_h_size, dtype=np.float32) - grid_w = np.arange(grid_w_size, dtype=np.float32) - grid = np.meshgrid(grid_w, grid_h) # here w goes first - grid = np.stack(grid, axis=0) - - pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) - return pos_embed - - -def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): - assert embed_dim % 2 == 0 - - # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[0]) # (H, W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[1]) # (H, W, 
D/2) - - emb = np.concatenate([emb_h, emb_w], axis=-1) # (H, W, D) - return emb - - -def get_1d_sincos_pos_embed_from_grid_new(embed_dim, pos): - """ - embed_dim: output dimension for each position - pos: a list of positions to be encoded: size (H, W) - out: (H, W, D) - """ - assert embed_dim % 2 == 0 - omega = np.arange(embed_dim // 2, dtype=np.float32) - omega /= embed_dim / 2.0 - omega = 1.0 / 10000**omega # (D/2,) - - out = np.einsum("hw,d->hwd", pos, omega) # (H, W, D/2), outer product - - # Align with C++ which always uses double - emb_sin = np.sin(out.astype(np.float64)).astype(np.float32) # (H, W, D/2) - emb_cos = np.cos(out.astype(np.float64)).astype(np.float32) # (H, W, D/2) - - emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (H, W, D) - return emb - - -def patch_model_code(orig_model_dir): - model_file = orig_model_dir / "modeling_navit_siglip.py" - orig_model_file = model_file.parent / ("orig_" + model_file.name) - if not orig_model_file.exists(): - model_file.rename(orig_model_file) - with orig_model_file.open("r") as f: - content = f.read() - content = content.replace("if is_flash_attn_2_available():", "") - content = content.replace("from flash_attn import flash_attn_func, flash_attn_varlen_func", "") - content = content.replace("from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "") - - with model_file.open("w") as out_f: - out_f.write(content) - - -def convert_llm(model, model_dir): - model.llm.config.save_pretrained(model_dir / text_emb_path.parent) - print("⌛ Convert Input embedding model") - ov_model = ov.convert_model(model.llm.model.embed_tokens, example_input=torch.ones([1, 10], dtype=torch.long)) - - ov.save_model(ov_model, model_dir / text_emb_path) - del ov_model - cleanup_torchscript_cache() - gc.collect() - print("✅ Input embedding model successfully converted") - - print("⌛ Convert Language model") - hidden_size = model.llm.config.hidden_size - num_pkv = model.llm.config.num_hidden_layers - pkv_shape = (2, model.llm.config.num_key_value_heads, 2, hidden_size // model.llm.config.num_attention_heads) - - input_embeds = torch.randn((2, 2, hidden_size)) - attention_mask = torch.ones([2, 4], dtype=torch.long) - position_ids = torch.tensor([[2, 3], [2, 3]], dtype=torch.long) - input_names = ["attention_mask", "position_ids"] - output_names = ["logits"] - - past_key_values = [] - for i in range(num_pkv): - kv = [torch.randn(pkv_shape) for _ in range(2)] - past_key_values.append(kv) - input_names.extend([f"past_key_values.{i}.key", f"past_key_values.{i}.value"]) - output_names.extend([f"present.{i}.key", f"present.{i}.value"]) - input_names.append("inputs_embeds") - - example_input = {"inputs_embeds": input_embeds, "attention_mask": attention_mask, "position_ids": position_ids, "past_key_values": past_key_values} - - model.llm.config.torchscript = True - - ov_model = ov.convert_model(model.llm, example_input=example_input) - - for out, out_name in zip(ov_model.outputs, output_names): - out.get_tensor().set_names({out_name}) - - for inp, inp_name in zip(ov_model.inputs, input_names): - inp.get_tensor().set_names({inp_name}) - - patch_stateful(ov_model) - - ov.save_model(ov_model, model_dir / llm_path) - del ov_model - - cleanup_torchscript_cache() - gc.collect() - print("✅ Language model successfully converted") - - -def convert_vision_encoder(model, model_dir): - tgt_sizes = torch.tensor([[23, 45]]) - if not (model_dir / image_emb_path).exists(): - print("⌛ Convert Image embedding model") - def siglip_vis_embed_forward( - self, - 
pixel_values: torch.FloatTensor, - patch_attention_mask: torch.BoolTensor, - tgt_sizes: Optional[torch.IntTensor] = None, - position_ids: Optional[torch.FloatTensor] = None, - ) -> torch.Tensor: - patch_embeds = self.patch_embedding(pixel_values) - embeddings = patch_embeds.flatten(2).transpose(1, 2) - - if position_ids is None: - batch_size = pixel_values.size(0) - max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) - max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size - boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side) - position_ids = torch.full( - size=( - batch_size, - max_nb_patches_h * max_nb_patches_w, - ), - fill_value=0, - ) - - for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - if tgt_sizes is not None: - nb_patches_h = tgt_sizes[batch_idx][0] - nb_patches_w = tgt_sizes[batch_idx][1] - else: - nb_patches_h = p_attn_mask[:, 0].sum() - nb_patches_w = p_attn_mask[0].sum() - - fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) - fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) - - bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) - bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) - - pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten() - position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids - - position_ids = position_ids.to(self.position_embedding.weight.device) - - embeddings = embeddings + self.position_embedding(position_ids) - return embeddings - - def siglip_attn_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, key_states, value_states, attention_mask, is_causal=attention_mask is None - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, None - - def siglip_transformer_forward( - self, - pixel_values, - patch_attention_mask: Optional[torch.BoolTensor] = None, - tgt_sizes: Optional[torch.IntTensor] = None, - position_ids: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - batch_size = pixel_values.size(0) - if patch_attention_mask is None: - 
patch_attention_mask = torch.ones( - size=( - batch_size, - pixel_values.size(2) // self.config.patch_size, - pixel_values.size(3) // self.config.patch_size, - ), - dtype=torch.bool, - device=pixel_values.device, - ) - - hidden_states = self.embeddings( - pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, tgt_sizes=tgt_sizes, position_ids=position_ids - ) - - patch_attention_mask = patch_attention_mask.view(batch_size, -1) - attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) if not self._use_flash_attention_2 else patch_attention_mask - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.post_layernorm(last_hidden_state) - - if not return_dict: - return (last_hidden_state, None) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=None, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - vpm = model.vpm - vpm.embeddings.forward = types.MethodType(siglip_vis_embed_forward, vpm.embeddings) - for layer in vpm.encoder.layers: - layer.self_attn.forward = types.MethodType(siglip_attn_forward, layer.self_attn) - vpm.forward = types.MethodType(siglip_transformer_forward, vpm) - - pixel_values = torch.randn([1, 3, 14, 14490]) - patch_attn_mask = torch.zeros((1, 1, 1035), dtype=torch.bool) - patch_attn_mask[0, 0, : tgt_sizes[0][0] * tgt_sizes[0][1]] = True - position_ids = prepare_vis_position_ids( - pixel_values, patch_attn_mask, tgt_sizes, model.config.vision_config.patch_size, model.config.vision_config.image_size // model.config.patch_size - ) - ov_model = ov.convert_model(vpm, example_input={"pixel_values": pixel_values, "position_ids": position_ids, "patch_attention_mask": patch_attn_mask}) - ov.save_model(ov_model, model_dir / image_emb_path) - del ov_model - cleanup_torchscript_cache() - gc.collect() - print("✅ Image embedding model successfully converted") - - if not (model_dir / resampler_path).exists(): - print("⌛ Convert Resamler model") - - def resampler_forward(self, x, pos_embed, key_padding_mask): - bs = x.shape[0] - x = self.kv_proj(x) # B * L * D - x = self.ln_kv(x).permute(1, 0, 2) # L * B * D - - q = self.ln_q(self.query) # Q * D - - q_bs = q.unsqueeze(1).repeat(1, bs, 1) - - out = self.attn(q_bs, x + pos_embed, x, key_padding_mask=key_padding_mask)[0] # Q * B * D # L * B * D + L * B * D - # out: Q * B * D - x = out.permute(1, 0, 2) # B * Q * D - - x = self.ln_post(x) - x = x @ self.proj - return x - - model.resampler.forward = types.MethodType(resampler_forward, model.resampler) - - pos_embed_base = get_2d_sincos_pos_embed(model.resampler.embed_dim, 70) - - patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] - - max_patch_len = torch.max(patch_len) - key_padding_mask = torch.zeros((1, max_patch_len), dtype=torch.bool) - - pos_embed = [] - tgt_h, tgt_w = tgt_sizes[0] - pos_embed = torch.from_numpy(pos_embed_base[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, 1, -1))) # patches * D - key_padding_mask[0, patch_len:] = True - - ov_model = ov.convert_model(model.resampler, example_input=[torch.randn(1, 1035, 1152), pos_embed, key_padding_mask]) - ov.save_model(ov_model, model_dir / resampler_path) - del ov_model - cleanup_torchscript_cache() - del model.resampler - gc.collect() - print("✅ Resampler model 
successfully converted") - - -def copy_llm_files(model_dir, dst_dir): - shutil.copy(model_dir / text_emb_path, model_dir / dst_dir / text_emb_path.name) - shutil.copy(model_dir / text_emb_path.with_suffix(".bin"), model_dir / dst_dir / text_emb_path.with_suffix(".bin").name) - shutil.copy(model_dir / llm_path.parent / "config.json", model_dir / dst_dir / "config.json") - shutil.copy(model_dir / llm_path.parent / "configuration_minicpm.py", model_dir / dst_dir / "configuration_minicpm.py") - shutil.copy(model_dir / llm_path.parent / "modeling_navit_siglip.py", model_dir / dst_dir / "modeling_navit_siglip.py") - - -def prepare_vis_position_ids(pixel_values, patch_attention_mask, tgt_sizes, patch_size, num_patches_per_side): - batch_size = pixel_values.size(0) - max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) - max_nb_patches_h, max_nb_patches_w = max_im_h // patch_size, max_im_w // patch_size - boundaries = torch.arange(1 / num_patches_per_side, 1.0, 1 / num_patches_per_side) - position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) - - for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - if tgt_sizes is not None: - nb_patches_h = tgt_sizes[batch_idx][0] - nb_patches_w = tgt_sizes[batch_idx][1] - else: - nb_patches_h = p_attn_mask[:, 0].sum() - nb_patches_w = p_attn_mask[0].sum() - - fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) - fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) - - bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) - bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) - - pos_ids = (bucket_coords_h[:, None] * num_patches_per_side + bucket_coords_w).flatten() - position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids - - return position_ids - - -core = ov.Core() - - -class OvModelForCausalLMWithEmb(GenerationMixin): - def __init__(self, model_dir, device="CPU", ov_config=None, compile=True, slice_lm_head=True) -> None: - self._supports_cache_class = False - self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) - self.config.is_decoder = True - self.config.is_encoder_decoder = False - self.generation_config = GenerationConfig.from_model_config(self.config) - model_dir = Path(model_dir) - self.model = core.read_model(model_dir / "openvino_language_model.xml") - self.token_emb = core.read_model(model_dir / "openvino_text_embeddings_model.xml") - if slice_lm_head: - self.slice_lm_head() - self.request = None - self.token_emb_request = None - self._device = device.upper() - self.device = torch.device("cpu") - self.ov_config = ov_config - self.next_beam_idx = None - self._past_length = None - self.input_names = [input_t.get_any_name() for input_t in self.model.inputs] - self.main_input_name = "input_ids" - self.llm_times = [] - if compile: - self.compile() - - def slice_lm_head(self): - manager = Manager() - manager.register_pass(InsertSlice()) - manager.run_passes(self.model) - self.model.validate_nodes_and_infer_types() - - def compile(self): - if self.request is None: - self.request = core.compile_model(self.model, self._device, self.ov_config).create_infer_request() - self._compile_token_emb() - - def _compile_token_emb(self): - if self.token_emb_request is None: - self.token_emb_request = core.compile_model(self.token_emb, self._device, self.ov_config) - - def to(self, device: str): - if isinstance(device, str): - self._device = device.upper() - self.clear_requests() - - return self - - def 
clear_requests(self): - del self.request - del self.token_emb_request - self.request = None - self.token_emb_request = None - - def embed_tokens(self, input_ids: torch.LongTensor): - self._compile_token_emb() - res = self.token_emb_request(input_ids, share_inputs=True) - return res[0] - - def prepare_inputs( - self, - input_ids: torch.LongTensor, - attention_mask: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - **kwargs, - ): - batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0] - - inputs = {} - # past_key_values are not used explicitly, instead they are handled inside the model - if past_key_values is None: - self.llm_times = [] - # This is the first iteration in a sequence, reset all states - if self.request is not None: - self.request.reset_state() - # Set initial value for the next beam_idx input that will be used at the current iteration - # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used - self.next_beam_idx = np.arange(batch_size, dtype=int) - self._past_length = 0 - past_len = self._get_past_length(past_key_values) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids if past_key_values is None else input_ids[:, -1:]) - - if hasattr(self.config, "scale_emb"): - inputs_embeds = inputs_embeds * self.config.scale_emb - inputs["inputs_embeds"] = inputs_embeds - - # Add the attention_mask inputs when needed - if "attention_mask" in self.input_names or "position_ids" in self.input_names: - if attention_mask is not None: - attention_mask = np.array(attention_mask) - else: - attention_mask = np.ones((inputs_embeds.shape[0], inputs_embeds.shape[1] + past_len), dtype=int) - - if "attention_mask" in self.input_names: - inputs["attention_mask"] = attention_mask - - if "position_ids" in self.input_names: - if position_ids is not None: - position_ids = np.array(position_ids) - else: - position_ids = np.cumsum(attention_mask, axis=1) - 1 - position_ids[attention_mask == 0] = 1 - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - inputs["position_ids"] = position_ids - - if "beam_idx" in self.input_names: - inputs["beam_idx"] = self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int) - - return inputs - - def forward( - self, - input_ids: torch.LongTensor, - attention_mask: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.LongTensor] = None, - **kwargs, - ): - self.compile() - - inputs = self.prepare_inputs( - input_ids=input_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - **kwargs, - ) - - # Run inference - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - logits = self.request.get_tensor("logits").data - logits = torch.from_numpy(logits).to(self.device) - past_key_values = ((),) - self._past_length += inputs["inputs_embeds"].shape[1] - - return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) - - # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): - # 
if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - - if past_key_values is not None: - past_len = self._get_past_length(past_key_values) - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and input_ids is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_len) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif input_ids is not None and past_len < input_ids.shape[1]: - input_ids = input_ids[:, past_len:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values and input_ids is not None: - position_ids = position_ids[:, -input_ids.shape[1] :] - - model_inputs = { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": position_ids, - "attention_mask": attention_mask, - "inputs_embeds": inputs_embeds if past_key_values is None else None, - } - - return model_inputs - - def _get_past_length(self, past_key_values=None): - if past_key_values is None: - return 0 - return self._past_length - - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache - def _reorder_cache(self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. - This is required to match `past_key_values` with the correct beam_idx at every generation step. 
- """ - self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration - return past_key_values - - def can_generate(self): - """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" - - return True - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - - -class OvMiniCPMV: - def __init__(self, config, vpm, resampler, llm, processor): - self.config = config - self.llm = llm - self.vpm = vpm - self.embed_dim = self.llm.config.hidden_size - self._resampler = resampler - self.processor = processor - self._pos_embeds = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, 70)).float() - self.max_size = (70, 70) - - self.terminators = ["<|im_end|>", "<|endoftext|>"] - - def set_decoder(self, decoder): - self.llm = decoder - - def get_decoder(self): - return self.llm - - def resampler(self, x, tgt_sizes): - bs = x.shape[0] - - patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] - - self._adjust_pos_cache(tgt_sizes) - - max_patch_len = torch.max(patch_len) - key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool) - - pos_embed = [] - for i in range(bs): - tgt_h, tgt_w = tgt_sizes[i] - pos_embed.append(self._pos_embeds[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1))) # patches * D - key_padding_mask[i, patch_len[i] :] = True - - pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(1, 0, 2) # BLD => L * B * D - - res = torch.from_numpy(self._resampler([x, pos_embed, key_padding_mask])[0]) - return res - - def _set_2d_pos_cache(self, max_size): - pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, max_size)).float() - self._pos_embed = pos_embed - - def _adjust_pos_cache(self, tgt_sizes): - max_h = torch.max(tgt_sizes[:, 0]) - max_w = torch.max(tgt_sizes[:, 1]) - if max_h > self.max_size[0] or max_w > self.max_size[1]: - self.max_size = [max(max_h, self.max_size[0]), max(max_w, self.max_size[1])] - self._set_2d_pos_cache(self.max_size) - - def get_vllm_embedding(self, data): - if "vision_hidden_states" not in data: - tgt_sizes = data["tgt_sizes"] - pixel_values_list = data["pixel_values"] - vision_hidden_states = [] - all_pixel_values = [] - img_cnt = [] - for pixel_values in pixel_values_list: - img_cnt.append(len(pixel_values)) - all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_values]) - - # exist image - if all_pixel_values: - tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)] - tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32) - - max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]) - - all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True, padding_value=0.0) - B, L, _ = all_pixel_values.shape - all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L) - - patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool) - for i in range(B): - patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True - - vision_batch_size = 32 - all_pixel_values = all_pixel_values - if B > vision_batch_size: - hs = [] - for i in range(0, B, vision_batch_size): - start_idx = i - end_idx = i + vision_batch_size - block_pxl_values = all_pixel_values[start_idx:end_idx] - block_patch_attn_mask = patch_attn_mask[start_idx:end_idx] - block_tgt_sizes = tgt_sizes[start_idx:end_idx] - block_position_ids = prepare_vis_position_ids( - block_pxl_values, - block_patch_attn_mask, - block_tgt_sizes, - 
self.config.vision_config.patch_size, - self.config.vision_config.image_size // self.config.patch_size, - ) - start = time.perf_counter() - tmp_hs = torch.from_numpy(self.vpm([block_pxl_values, block_patch_attn_mask, block_position_ids])[0]) - self.vpm_times.append(time.perf_counter() - start) - hs.append(tmp_hs) - vision_embedding = torch.cat(hs, dim=0) - else: - position_ids = prepare_vis_position_ids( - all_pixel_values, - patch_attn_mask, - tgt_sizes, - self.config.vision_config.patch_size, - self.config.vision_config.image_size // self.config.patch_size, - ) - start = time.perf_counter() - vision_embedding = torch.from_numpy(self.vpm([all_pixel_values, patch_attn_mask, position_ids])[0]) - vision_embedding = torch.from_numpy(self.vpm([all_pixel_values, patch_attn_mask, position_ids])[0]) - vision_embedding = self.resampler(vision_embedding, tgt_sizes) - - start = 0 - for pixel_values in pixel_values_list: - img_cnt = len(pixel_values) - if img_cnt > 0: - vision_hidden_states.append(vision_embedding[start : start + img_cnt]) - start += img_cnt - else: - vision_hidden_states.append([]) - else: # no image - dummy_feature = [] - for _ in range(len(pixel_values_list)): - vision_hidden_states.append(dummy_feature) - - else: - vision_hidden_states = data["vision_hidden_states"] - - if hasattr(self.llm.config, "scale_emb"): - vllm_embedding = self.llm.embed_tokens(data["input_ids"]) * self.llm.config.scale_emb - else: - vllm_embedding = self.llm.embed_tokens(data["input_ids"]) - - bs = len(data["input_ids"]) - for i in range(bs): - cur_vs_hs = vision_hidden_states[i] - if len(cur_vs_hs) > 0: - cur_vllm_emb = torch.from_numpy(vllm_embedding[i]) - cur_image_bound = data["image_bound"][i] - if len(cur_image_bound) > 0: - image_indices = torch.stack([torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound]) - - cur_vllm_emb.scatter_(0, image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]), cur_vs_hs.view(-1, cur_vs_hs.shape[-1])) - return vllm_embedding - - def forward(self, data, **kwargs): - vllm_embedding = self.get_vllm_embedding(data) - position_ids = data["position_ids"] - if position_ids.dtype != torch.int64: - position_ids = position_ids.long() - - return self.llm(input_ids=None, position_ids=position_ids, inputs_embeds=vllm_embedding, **kwargs) - - def _decode(self, inputs_embeds, tokenizer, attention_mask, decode_text=False, **kwargs): - terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] - output = self.llm.generate( - inputs_embeds=torch.from_numpy(inputs_embeds), pad_token_id=0, eos_token_id=terminators, attention_mask=attention_mask, **kwargs - ) - if decode_text: - return self._decode_text(output, tokenizer) - return output - - def _decode_stream(self, inputs_embeds, tokenizer, **kwargs): - terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] - streamer = TextIteratorStreamer(tokenizer=tokenizer) - generation_kwargs = {"inputs_embeds": torch.from_numpy(inputs_embeds), "pad_token_id": 0, "eos_token_id": terminators, "streamer": streamer} - generation_kwargs.update(kwargs) - - thread = Thread(target=self.llm.generate, kwargs=generation_kwargs) - thread.start() - - return streamer - - def _decode_text(self, result_ids, tokenizer): - terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] - result_text = [] - for result in result_ids: - result = result[result != 0] - if result[0] == tokenizer.bos_id: - result = result[1:] - if result[-1] in terminators: - result = result[:-1] - 
result_text.append(tokenizer.decode(result).strip()) - return result_text - - def generate( - self, - input_ids=None, - pixel_values=None, - tgt_sizes=None, - image_bound=None, - attention_mask=None, - tokenizer=None, - vision_hidden_states=None, - return_vision_hidden_states=False, - stream=False, - decode_text=False, - **kwargs, - ): - assert input_ids is not None - assert len(input_ids) == len(pixel_values) - - model_inputs = { - "input_ids": input_ids, - "image_bound": image_bound, - } - - if vision_hidden_states is None: - model_inputs["pixel_values"] = pixel_values - model_inputs["tgt_sizes"] = tgt_sizes - else: - model_inputs["vision_hidden_states"] = vision_hidden_states - - with torch.inference_mode(): - model_inputs["inputs_embeds"] = self.get_vllm_embedding(model_inputs) - - if stream: - result = self._decode_stream(model_inputs["inputs_embeds"], tokenizer, **kwargs) - else: - result = self._decode(model_inputs["inputs_embeds"], tokenizer, attention_mask, decode_text=decode_text, **kwargs) - - return result - - def chat( - self, - image, - msgs, - tokenizer, - processor=None, - vision_hidden_states=None, - max_new_tokens=2048, - min_new_tokens=0, - sampling=True, - max_inp_length=8192, - system_prompt="", - stream=False, - max_slice_nums=None, - use_image_id=None, - **kwargs, - ): - self.vpm_times = [] - self.resampler_times = [] - if isinstance(msgs[0], list): - batched = True - else: - batched = False - msgs_list = msgs - images_list = image - - if batched is False: - images_list, msgs_list = [images_list], [msgs_list] - else: - assert images_list is None, "Please integrate image to msgs when using batch inference." - images_list = [None] * len(msgs_list) - assert len(images_list) == len(msgs_list), "The batch dim of images_list and msgs_list should be the same." - - if processor is None: - if self.processor is None: - self.processor = AutoProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True) - processor = self.processor - - assert ( - self.config.query_num == processor.image_processor.image_feature_size - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert ( - self.config.patch_size == processor.image_processor.patch_size - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert ( - self.config.use_image_id == processor.image_processor.use_image_id - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert ( - self.config.slice_config.max_slice_nums == processor.image_processor.max_slice_nums - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert ( - self.config.slice_mode == processor.image_processor.slice_mode - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." 
- - prompts_lists = [] - input_images_lists = [] - for image, msgs in zip(images_list, msgs_list): - if isinstance(msgs, str): - msgs = json.loads(msgs) - copy_msgs = deepcopy(msgs) - - assert len(msgs) > 0, "msgs is empty" - - if image is not None and isinstance(copy_msgs[0]["content"], str): - copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]] - - images = [] - for i, msg in enumerate(copy_msgs): - role = msg["role"] - content = msg["content"] - assert role in ["user", "assistant"] - if i == 0: - assert role == "user", "The role of first msg should be user" - if isinstance(content, str): - content = [content] - cur_msgs = [] - for c in content: - if isinstance(c, Image.Image): - images.append(c) - cur_msgs.append("(./)") - elif isinstance(c, str): - cur_msgs.append(c) - msg["content"] = "\n".join(cur_msgs) - - if system_prompt: - sys_msg = {"role": "system", "content": system_prompt} - copy_msgs = [sys_msg] + copy_msgs - - prompts_lists.append(processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True)) - input_images_lists.append(images) - - inputs = processor( - prompts_lists, input_images_lists, max_slice_nums=max_slice_nums, use_image_id=use_image_id, return_tensors="pt", max_length=max_inp_length - ) - - if sampling: - generation_config = {"top_p": 0.8, "top_k": 100, "temperature": 0.7, "do_sample": True, "repetition_penalty": 1.05} - else: - generation_config = { - "repetition_penalty": 1.0, - } - - if min_new_tokens > 0: - generation_config["min_new_tokens"] = min_new_tokens - - generation_config.update((k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()) - - inputs.pop("image_sizes") - with torch.inference_mode(): - res = self.generate( - **inputs, - tokenizer=tokenizer, - max_new_tokens=max_new_tokens, - vision_hidden_states=vision_hidden_states, - stream=stream, - decode_text=True, - **generation_config, - ) - - if stream: - - def stream_gen(): - for text in res: - for term in self.terminators: - text = text.replace(term, "") - yield text - - return stream_gen() - - else: - if batched: - answer = res - else: - answer = res[0] - return answer - - -def init_model(model_dir, device): - config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) - llm = OvModelForCausalLMWithEmb(model_dir, device) - img_emb = core.compile_model(model_dir / image_emb_path, device) - resampler = core.compile_model(model_dir / resampler_path, device) - processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True) - - ov_model = OvMiniCPMV(config, img_emb, resampler, llm, processor) - return ov_model - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("model_dir", type=Path) - model_dir = parser.parse_args().model_dir - model_id = "openbmb/MiniCPM-V-2_6" - ckpt = model_dir / "ckpt" - if not ckpt.exists(): - snapshot_download(model_id, local_dir=ckpt, force_download=True) - patch_model_code(ckpt) - model = AutoModel.from_pretrained(ckpt, trust_remote_code=True) - model.eval() - model.config.save_pretrained(model_dir) - tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True) - tokenizer.save_pretrained(model_dir) - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, with_detokenizer=True) - ov.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml") - ov.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml") - processor = AutoProcessor.from_pretrained(ckpt, trust_remote_code=True) - processor.save_pretrained(model_dir) - - 
convert_llm(model, model_dir)
-    del model.llm
-    gc.collect()
-
-    convert_vision_encoder(model, model_dir)
-    # ov_cpm = init_model(model_dir, "CPU")
-    # print(ov_cpm.chat(Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw), [{"role": "user", "content": "What is unusual on this image?"}], ov_cpm.processor.tokenizer, sampling=False))
-
-if "__main__" == __name__:
-    main()
diff --git a/samples/python/visual_language_chat/README.md b/samples/python/visual_language_chat/README.md
index 12ffb27f99..06355d9ee5 100644
--- a/samples/python/visual_language_chat/README.md
+++ b/samples/python/visual_language_chat/README.md
@@ -10,7 +10,7 @@ It's not required to install [../../requirements.txt](../../requirements.txt) fo
 ```sh
 pip install --upgrade-strategy eager -r ../../requirements.txt
-optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 miniCPM-V-2_6 --trust-remote-code
+optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code MiniCPM-V-2_6
 ```
 
 ## Run:
diff --git a/samples/requirements.txt b/samples/requirements.txt
index 870597f06f..1b84354f09 100644
--- a/samples/requirements.txt
+++ b/samples/requirements.txt
@@ -5,4 +5,3 @@ einops==0.8.0 # For Qwen
 transformers_stream_generator==0.0.5 # For Qwen
 diffusers==0.30.3
 librosa # For Whisper
-torchvision # needed for mini-CPM export script. Need to remove when we switch to exporting with optimum-intel.

From 8410b22810b921b450ddbf857817bd6d68168d9b Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Tue, 15 Oct 2024 14:39:35 +0400
Subject: [PATCH 20/28] Put torchvision back

---
 samples/requirements.txt           | 1 +
 tests/python_tests/test_vlm_api.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/samples/requirements.txt b/samples/requirements.txt
index 1b84354f09..f829c37eae 100644
--- a/samples/requirements.txt
+++ b/samples/requirements.txt
@@ -5,3 +5,4 @@ einops==0.8.0 # For Qwen
 transformers_stream_generator==0.0.5 # For Qwen
 diffusers==0.30.3
 librosa # For Whisper
+torchvision # For visual langugage models
diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_vlm_api.py
index 94ad8c76be..ec49eb0f93 100644
--- a/tests/python_tests/test_vlm_api.py
+++ b/tests/python_tests/test_vlm_api.py
@@ -72,7 +72,7 @@ def streamer(word: str) -> bool:
     get_beam_search(),
     get_multinomial_all_parameters(),
 ])
-@pytest.mark.sip("Enable after sampler are enabled")
+@pytest.mark.skip("Enable after sampler are enabled")
 def test_sampling(config, cache):
     model_path = get_ov_model(cache.mkdir("MiniCPM-V-2_6"))
     image = get_image_by_link(image_links[0])

From 1fea50fc5e3a99ce0aac37ec5dfb3e1fef66197d Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Tue, 15 Oct 2024 15:44:53 +0400
Subject: [PATCH 21/28] update tests requirements

---
 .github/workflows/linux.yml         | 2 ++
 .github/workflows/windows.yml       | 3 +++
 tests/python_tests/requirements.txt | 2 +-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 2f327ecf31..591fd4ab4b 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -260,6 +260,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install -U "optimum<1.23" --no-dependencies
           python -m pytest ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"
@@ -347,6 +348,7 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install -U "optimum<1.23" --no-dependencies
           python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
         env:
           PYTHONPATH: "./build/:$PYTHONPATH"
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 0b8cece3fb..2d3724a4eb 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -237,6 +237,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install -U "optimum<1.23" --no-dependencies
           python -m pytest ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
@@ -301,6 +302,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install -U "optimum<1.23" --no-dependencies
           python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
@@ -366,6 +368,7 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install -U "optimum<1.23" --no-dependencies
           python -m pytest ./tests/python_tests/test_vlm_api.py
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index 0e48cc125d..5747f07e02 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
 numpy<2.0.0; sys_platform == 'darwin'
 onnx==1.16.1
 pytest

From d1448efc2eb836b4b986c77f0ec03db8d4c2fad1 Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Tue, 15 Oct 2024 16:11:48 +0400
Subject: [PATCH 22/28] remove wwb req

---
 tests/python_tests/requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index 5747f07e02..703934954e 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -3,7 +3,6 @@ optimum-intel @ git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
 numpy<2.0.0; sys_platform == 'darwin'
 onnx==1.16.1
 pytest
-llm_bench/python/who_what_benchmark
 
 # requirements for specific models
 # - hf-tiny-model-private/tiny-random-RoFormerForCausalLM

From 67e60aca93f7a6725646992e7f98eae382a19e2b Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Tue, 15 Oct 2024 16:30:00 +0400
Subject: [PATCH 23/28] wwb reqs

---
 llm_bench/python/requirements.txt   | 2 +-
 tests/python_tests/requirements.txt | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt
index 6139bf843c..b11cfdd683 100644
--- a/llm_bench/python/requirements.txt
+++ b/llm_bench/python/requirements.txt
@@ -10,7 +10,7 @@ torch
 transformers>=4.40.0
 diffusers>=0.22.0
 #optimum is in dependency list of optimum-intel
-git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
+optimum-intel @ git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
 git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf
 packaging
 psutil
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index 703934954e..5747f07e02 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -3,6 +3,7 @@ optimum-intel @ git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
 numpy<2.0.0; sys_platform == 'darwin'
 onnx==1.16.1
 pytest
+llm_bench/python/who_what_benchmark
 
 # requirements for specific models
 # - hf-tiny-model-private/tiny-random-RoFormerForCausalLM

From f67ce00cb005b1694b5d254e2f5f5cf78d0ff1a1 Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Tue, 15 Oct 2024 17:08:57 +0400
Subject: [PATCH 24/28] req

---
 llm_bench/python/requirements.txt                    | 2 +-
 llm_bench/python/who_what_benchmark/requirements.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt
index b11cfdd683..6139bf843c 100644
--- a/llm_bench/python/requirements.txt
+++ b/llm_bench/python/requirements.txt
@@ -10,7 +10,7 @@ torch
 transformers>=4.40.0
 diffusers>=0.22.0
 #optimum is in dependency list of optimum-intel
-optimum-intel @ git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
+git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel
 git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf
 packaging
 psutil
diff --git a/llm_bench/python/who_what_benchmark/requirements.txt b/llm_bench/python/who_what_benchmark/requirements.txt
index caae595e69..637b1c9942 100644
--- a/llm_bench/python/who_what_benchmark/requirements.txt
+++ b/llm_bench/python/who_what_benchmark/requirements.txt
@@ -2,7 +2,7 @@ transformers>=4.35.2
 sentence-transformers>=2.2.2
 openvino>=2024.3.0
 openvino-telemetry
-optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
+optimum-intel @ git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
 openvino-tokenizers
 pandas>=2.0.3
 numpy>=1.23.5

From e2ac30eeb3226cd7138e8d478ce3d1c802e99c1e Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Tue, 15 Oct 2024 19:29:09 +0400
Subject: [PATCH 25/28] int8

---
 tests/python_tests/{test_vlm_api.py => test_avlm_api.py} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename tests/python_tests/{test_vlm_api.py => test_avlm_api.py} (97%)

diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_avlm_api.py
similarity index 97%
rename from tests/python_tests/test_vlm_api.py
rename to tests/python_tests/test_avlm_api.py
index ec49eb0f93..8fc163b5ff 100644
--- a/tests/python_tests/test_vlm_api.py
+++ b/tests/python_tests/test_avlm_api.py
@@ -18,7 +18,7 @@ def get_ov_model(model_dir):
     ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True)
     openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml")
     openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml")
-    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, load_in_8bit=False, trust_remote_code=True)
+    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, trust_remote_code=True)
     model.config.save_pretrained(model_dir)
     model.generation_config.save_pretrained(model_dir)
     model.save_pretrained(model_dir)

From e084e797fa80e6f3b5798a4407de451cb03d91aa Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Tue, 15 Oct 2024 21:05:32 +0400
Subject: [PATCH 26/28] xfail

---
 tests/python_tests/test_sampling.py                      | 4 ++--
 tests/python_tests/{test_avlm_api.py => test_vlm_api.py} | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename tests/python_tests/{test_avlm_api.py => test_vlm_api.py} (100%)

diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index 1e7a1b81a5..b13369b7ba 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -28,8 +28,8 @@
 @pytest.mark.precommit
 @pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit")))
 @pytest.mark.xfail(
-    raises=RuntimeError,
-    reason="Test fails with error: CPU: head size must be multiple of 16, current: X. CVS-145986.",
+    raises=(RuntimeError, AttributeError),
+    reason="RuntimeError with error: CPU: head size must be multiple of 16, current: X. CVS-145986. AttributeError: 'CodeGenAttention' object has no attribute 'causal_mask' for hf-tiny-model-private/tiny-random-CodeGenForCausalLM",
     strict=True,
 )
 def test_sampling_precommit(tmp_path, model_id):
diff --git a/tests/python_tests/test_avlm_api.py b/tests/python_tests/test_vlm_api.py
similarity index 100%
rename from tests/python_tests/test_avlm_api.py
rename to tests/python_tests/test_vlm_api.py

From 509fb2f6071cb30b24c8b6b0435a3652fc68e48c Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Wed, 16 Oct 2024 07:14:03 +0400
Subject: [PATCH 27/28] Move common model parts

---
 src/cpp/src/visual_language/pipeline.cpp   | 24 ++++++-------------
 .../src/visual_language/vision_encoder.cpp |  7 +-----
 2 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 9ce4e1ef12..0d5772202d 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -353,25 +353,15 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
                 model_dir / "openvino_resampler_model.xml", device, device_config
             ).create_infer_request();
 
-            m_embedding = ov::Core{}.compile_model(
-                model_dir / "openvino_text_embeddings_model.xml", device, device_config
-            ).create_infer_request();
-
-            m_language = ov::Core{}.compile_model(
-                model_dir / "openvino_language_model.xml", device, device_config
-            ).create_infer_request();
-
             m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
-        } else if (m_vlm_config.model_type == VLMModelType::LLAVA) {
-            m_language = ov::Core{}.compile_model(
-                model_dir / "openvino_language_model.xml", device, device_config
-            ).create_infer_request();
-
-            // Reusing the same m_embedding for llava text_embeddings model
-            m_embedding = ov::Core{}.compile_model(
-                model_dir / "openvino_text_embeddings_model.xml", device, device_config
-            ).create_infer_request();
         }
+        m_embedding = ov::Core{}.compile_model(
+            model_dir / "openvino_text_embeddings_model.xml", device, device_config
+        ).create_infer_request();
+
+        m_language = ov::Core{}.compile_model(
+            model_dir / "openvino_language_model.xml", device, device_config
+        ).create_infer_request();
 
         m_language.get_tensor("attention_mask").set_shape({1, 0});
     }
diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp
index d7308e6534..ee7e353e45 100644
--- a/src/cpp/src/visual_language/vision_encoder.cpp
+++ b/src/cpp/src/visual_language/vision_encoder.cpp
@@ -431,12 +431,7 @@ ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig
 VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) :
     model_type(model_type) {
-    if (model_type == VLMModelType::MINICPM) {
-        m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
-    } else if (model_type == VLMModelType::LLAVA) {
-        // Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel
-        m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
-    }
+    m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
     m_processor_config = ov::genai::utils::from_config_json_if_exists(
         model_dir, "preprocessor_config.json"
     );

From db8fdc9e8b2b1b2506dea6cb7562f1878af9a67d Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Thu, 17 Oct 2024 03:11:18 +0400
Subject: [PATCH 28/28] Increase timeout

---
 .github/workflows/mac.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index c2380aac15..7ea96bd30c 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -391,7 +391,8 @@ jobs:
         if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
-          timeout --verbose 27s ${OV_INSTALL_DIR}/samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0
+          ${OV_INSTALL_DIR}/samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0
+        timeout-minutes: 1
 
   Overall_Status:
     name: ci/gha_overall_status_macos