diff --git a/samples/cpp/visual_language_chat/load_image.cpp b/samples/cpp/visual_language_chat/load_image.cpp
index 554583a10d..7956f8c128 100644
--- a/samples/cpp/visual_language_chat/load_image.cpp
+++ b/samples/cpp/visual_language_chat/load_image.cpp
@@ -23,7 +23,7 @@ std::vector<ov::Tensor> utils::load_images(const std::filesystem::path& input_pa
     }
     if (images.empty())
-        throw std::runtime_error(std::string{"No one image found by path "} + input_path.string());
+        throw std::runtime_error(std::string{"No images were found in path "} + input_path.string());
     return images;
 }
diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp
index 3d6e412ee5..c5e24247c2 100644
--- a/samples/cpp/visual_language_chat/visual_language_chat.cpp
+++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp
@@ -6,8 +6,6 @@
 #include <openvino/genai/visual_language/pipeline.hpp>
 #include <filesystem>
 
-namespace fs = std::filesystem;
-
 bool print_subword(std::string&& subword) {
     return !(std::cout << subword << std::flush);
 }
@@ -19,6 +17,9 @@ int main(int argc, char* argv[]) try {
 
     std::vector<ov::Tensor> images = utils::load_images(argv[2]);
 
+    ov::genai::GenerationConfig generation_config;
+    generation_config.max_new_tokens = 30;
+
     std::string device = "CPU"; // GPU can be used as well
     ov::AnyMap enable_compile_cache;
     if ("GPU" == device) {
@@ -33,16 +34,16 @@ int main(int argc, char* argv[]) try {
     std::cout << "question:\n";
 
     std::getline(std::cin, prompt);
-    auto resuls = pipe.generate(prompt,
-                                ov::genai::images(images),
-                                ov::genai::generation_config(ov::genai::greedy()),
-                                ov::genai::streamer(print_subword));
+    pipe.generate(prompt,
+                  ov::genai::images(images),
+                  ov::genai::generation_config(generation_config),
+                  ov::genai::streamer(print_subword));
     std::cout << "\n----------\n"
         "question:\n";
     while (std::getline(std::cin, prompt)) {
-        resuls = pipe.generate(prompt,
-                               ov::genai::generation_config(ov::genai::greedy()),
-                               ov::genai::streamer(print_subword));
+        pipe.generate(prompt,
+                      ov::genai::generation_config(generation_config),
+                      ov::genai::streamer(print_subword));
         std::cout << "\n----------\n"
             "question:\n";
     }
diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp
index 2e631f6201..4885a21b8f 100644
--- a/src/cpp/src/sampler.cpp
+++ b/src/cpp/src/sampler.cpp
@@ -597,10 +597,11 @@ void register_new_token(const Token& sampled_token_id,
     }
 };
 
-std::vector<int32_t> Sampler::get_beam_idxs(uint64_t request_id) {
+std::vector<int32_t> Sampler::get_beam_idxs(SequenceGroup::CPtr sequence_group) {
+    size_t request_id = sequence_group->get_request_id();
     auto beam_searcher = m_beam_search_info.find(request_id);
     if (m_beam_search_info.find(request_id) == m_beam_search_info.end()) {
-        return { 0 };
+        return std::vector<int32_t>(sequence_group->num_running_seqs(), 0);
     }
     return beam_searcher->second.get_beam_idxs();
 }
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp
index ca73cbb92d..13933e0b75 100644
--- a/src/cpp/src/sampler.hpp
+++ b/src/cpp/src/sampler.hpp
@@ -65,7 +65,7 @@ class Sampler {
     SamplerOutput sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false);
     void set_seed(size_t seed) { rng_engine.seed(seed); }
     void clear_beam_search_info(uint64_t request_id);
-    std::vector<int32_t> get_beam_idxs(uint64_t request_id);
+    std::vector<int32_t> get_beam_idxs(SequenceGroup::CPtr sequence_group);
 };
 
 class Sampler::GroupBeamSearcher {
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 42bc8eb465..c3d12c6e0f 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -272,11 +272,6 @@ EncodedGenerationResult get_lm_encoded_results(
 
     SamplerOutput sampler_output = sampler.sample(requests, language.get_tensor("logits"));
 
-    // logits include image and prompt embedings
-    if (m_vlm_config.model_type == VLMModelType::LLAVA) {
-        request->update_processed_tokens_num(request->get_prompt_len());
-    }
-
     language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size});
     language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 });
 
@@ -326,13 +321,9 @@
 
         language.set_tensor("position_ids", position_ids);
 
-        std::vector<int32_t> beam_idxs = sampler.get_beam_idxs(request->get_request_id());
+        std::vector<int32_t> beam_idxs = sampler.get_beam_idxs(request);
         int32_t *beam_idx_data = beam_idx.data<int32_t>();
-        if (total_num_tokens > beam_idxs.size()) {
-            std::fill_n(beam_idx_data, total_num_tokens, 0);
-        } else {
-            copy(beam_idxs.begin(), beam_idxs.end(), beam_idx_data);
-        }
+        copy(beam_idxs.begin(), beam_idxs.end(), beam_idx_data);
         language.set_tensor("beam_idx", beam_idx);
 
         language.infer();
@@ -453,25 +444,25 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         const GenerationConfig& generation_config,
         const StreamerVariant& streamer
     ) {
-        // inputs_embeds, tokenized_input;
-        std::pair<ov::Tensor, ov::Tensor> processed_input;
+        ov::Tensor inputs_embeds;
         if (m_vlm_config.model_type == VLMModelType::MINICPM) {
-            processed_input = get_inputs_embeds_minicpm(prompt, rgbs);
+            inputs_embeds = get_inputs_embeds_minicpm(prompt, rgbs);
         } else if (m_vlm_config.model_type == VLMModelType::LLAVA) {
-            processed_input = get_inputs_embeds_llava(prompt, rgbs);
+            inputs_embeds = get_inputs_embeds_llava(prompt, rgbs);
         }
 
         Sampler sampler = Sampler(m_tokenizer);
+
         std::vector<SequenceGroup::Ptr> requests;
         size_t request_id = 0;
         size_t block_size = 1;
         bool enable_prefix_caching = false;
-        // now we have one prompt as input, so we need one request
-        SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, processed_input.second, generation_config, block_size, enable_prefix_caching);
-        size_t inputs_embeds_size = processed_input.first.get_shape()[1];
-        size_t tokenized_prompt_size = processed_input.second.get_size();
-        size_t num_processed_tokens = inputs_embeds_size <= tokenized_prompt_size ? tokenized_prompt_size - inputs_embeds_size : 0;
-        sequence_group->update_processed_tokens_num(num_processed_tokens);
+        size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1);
+        size_t inputs_embeds_size = inputs_embeds.get_shape().at(1);
+        ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size });
+
+        SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, prompt_ids, generation_config, block_size, enable_prefix_caching);
+        sequence_group->update_processed_tokens_num(history_size);
         sequence_group->set_sequence_group_ptr(sequence_group);
         requests.push_back(sequence_group);
 
@@ -492,7 +483,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         OPENVINO_ASSERT((generation_config.is_greedy_decoding() || generation_config.is_multinomial() || !streamer_ptr),
             "Currently streaming is possible only for greedy or multinomial decoding");
 
-        EncodedGenerationResult encoded_result = get_lm_encoded_results(m_language, m_embedding, processed_input.first, m_vlm_config, streamer_ptr, sampler, requests);
+        EncodedGenerationResult encoded_result = get_lm_encoded_results(m_language, m_embedding, inputs_embeds, m_vlm_config, streamer_ptr, sampler, requests);
 
         DecodedResults decoded;
         for (size_t idx = 0; idx < encoded_result.m_generation_ids.size(); ++idx) {
@@ -586,12 +577,12 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         m_generation_config = new_config;
     }
 
-    std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images) {
+    ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images) {
         std::string image_token = "<image>"; // TODO Consider getting from vlm_config or json
         std::string formatted_prompt = "USER: " + (images.empty() ? prompt : image_token + "\n" + prompt) + " ASSISTANT:";
         ov::Tensor input_ids = m_tokenizer.encode(formatted_prompt).input_ids;
         if (images.empty()) {
-            return std::pair{process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb), input_ids};
+            return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb);
         } else {
             OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed");
             EncodedImage encoded_image = m_vision_encoder.encode(images.at(0));
@@ -601,11 +592,11 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
 
             int64_t image_token_index = 32000; // TODO Consider getting from m_vlm_config.image_token_index or config.json
 
-            return std::pair{merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_index), input_ids};
+            return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_index);
         }
     }
 
-    std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images) {
+    ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images) {
         std::string images_prompt;
         std::vector<ov::Tensor> embeds;
         for (const ov::Tensor& rgb : images) {
@@ -736,7 +727,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
             }
         }
 
-        return std::pair{inputs_embeds, m_is_chat_conversation ? new_chat_tokens : encoded_input};
+        return inputs_embeds;
     }
 
     ov::Tensor resample(VLMPipeline::VLMPipelineImpl& pipe, const ov::Tensor& encoded_image, const std::vector<HeightWidth>& target_sizes) {
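A minimal usage sketch, not part of the patch: with the sample now passing an explicit ov::genai::GenerationConfig instead of ov::genai::greedy(), the same call site can opt into beam search, which the Sampler::get_beam_idxs(SequenceGroup::CPtr) change above is meant to support. The num_beams and num_return_sequences field names are assumed from the public GenerationConfig, and DecodedResults is assumed to expose a texts vector; the streamer is omitted because the pipeline asserts streaming only for greedy or multinomial decoding.

#include "load_image.hpp"

#include <openvino/genai/visual_language/pipeline.hpp>

#include <iostream>
#include <string>
#include <vector>

int main(int argc, char* argv[]) {
    if (argc != 3) {
        std::cerr << "Usage: " << argv[0] << " <MODELS_DIR> <IMAGE_FILE_OR_DIR>\n";
        return 1;
    }
    std::vector<ov::Tensor> images = utils::load_images(argv[2]);

    // Beam search instead of greedy decoding (field names assumed).
    ov::genai::GenerationConfig generation_config;
    generation_config.max_new_tokens = 30;
    generation_config.num_beams = 4;
    generation_config.num_return_sequences = 1;  // keep only the best beam

    std::string device = "CPU";  // GPU can be used as well
    ov::AnyMap enable_compile_cache;
    ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache);

    // No streamer: streaming is restricted to greedy/multinomial decoding.
    auto results = pipe.generate("Describe this image.",
                                 ov::genai::images(images),
                                 ov::genai::generation_config(generation_config));
    for (const std::string& text : results.texts) {
        std::cout << text << '\n';
    }
}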