apply comments
sbalandi committed Oct 16, 2024
1 parent 689fd3c commit 03451fc
Showing 5 changed files with 33 additions and 40 deletions.

samples/cpp/visual_language_chat/load_image.cpp (2 changes: 1 addition & 1 deletion)
@@ -23,7 +23,7 @@ std::vector<ov::Tensor> utils::load_images(const std::filesystem::path& input_path)
     }

     if (images.empty())
-        throw std::runtime_error(std::string{"No one image found by path "} + input_path.string());
+        throw std::runtime_error(std::string{"No images were found in path "} + input_path.string());

     return images;
 }

samples/cpp/visual_language_chat/visual_language_chat.cpp (19 changes: 10 additions & 9 deletions)
@@ -6,8 +6,6 @@
 #include <filesystem>
 #include <openvino/runtime/intel_gpu/properties.hpp>

-namespace fs = std::filesystem;
-
 bool print_subword(std::string&& subword) {
     return !(std::cout << subword << std::flush);
 }
@@ -19,6 +17,9 @@ int main(int argc, char* argv[]) try {

     std::vector<ov::Tensor> images = utils::load_images(argv[2]);

+    ov::genai::GenerationConfig generation_config;
+    generation_config.max_new_tokens = 30;
+
     std::string device = "CPU"; // GPU can be used as well
     ov::AnyMap enable_compile_cache;
     if ("GPU" == device) {
@@ -33,16 +34,16 @@ int main(int argc, char* argv[]) try {
     std::cout << "question:\n";

     std::getline(std::cin, prompt);
-    auto resuls = pipe.generate(prompt,
-                                ov::genai::images(images),
-                                ov::genai::generation_config(ov::genai::greedy()),
-                                ov::genai::streamer(print_subword));
+    pipe.generate(prompt,
+                  ov::genai::images(images),
+                  ov::genai::generation_config(generation_config),
+                  ov::genai::streamer(print_subword));
     std::cout << "\n----------\n"
         "question:\n";
     while (std::getline(std::cin, prompt)) {
-        resuls = pipe.generate(prompt,
-                               ov::genai::generation_config(ov::genai::greedy()),
-                               ov::genai::streamer(print_subword));
+        pipe.generate(prompt,
+                      ov::genai::generation_config(generation_config),
+                      ov::genai::streamer(print_subword));
         std::cout << "\n----------\n"
             "question:\n";
     }
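
For context, a minimal sketch of how a caller drives the pipeline after this change, using an explicit ov::genai::GenerationConfig in place of ov::genai::greedy(). The model directory and image folder are placeholders, and load_image.hpp refers to this sample's own helper; treat this as an illustration under those assumptions, not a drop-in replacement for the sample.

#include "load_image.hpp"
#include <openvino/genai/visual_language/pipeline.hpp>
#include <iostream>

int main() {
    // Placeholder paths; the real sample takes them from argv.
    ov::genai::VLMPipeline pipe("./MiniCPM-V-2_6/", "CPU");
    std::vector<ov::Tensor> images = utils::load_images("./images");

    // A default-constructed config decodes greedily, so only the token
    // budget needs to be set explicitly.
    ov::genai::GenerationConfig generation_config;
    generation_config.max_new_tokens = 30;

    ov::genai::DecodedResults result = pipe.generate(
        "Describe the image.",
        ov::genai::images(images),
        ov::genai::generation_config(generation_config));
    std::cout << result.texts.at(0) << '\n';
    return 0;
}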

src/cpp/src/sampler.cpp (5 changes: 3 additions & 2 deletions)
@@ -597,10 +597,11 @@ void register_new_token(const Token& sampled_token_id,
     }
 };

-std::vector<int32_t> Sampler::get_beam_idxs(uint64_t request_id) {
+std::vector<int32_t> Sampler::get_beam_idxs(SequenceGroup::CPtr sequence_group) {
+    size_t request_id = sequence_group->get_request_id();
     auto beam_searcher = m_beam_search_info.find(request_id);
     if (m_beam_search_info.find(request_id) == m_beam_search_info.end()) {
-        return { 0 };
+        return std::vector<int32_t>(sequence_group->num_running_seqs(), 0);
     }
     return beam_searcher->second.get_beam_idxs();
 }

src/cpp/src/sampler.hpp (2 changes: 1 addition & 1 deletion)
@@ -65,7 +65,7 @@ class Sampler {
     SamplerOutput sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false);
     void set_seed(size_t seed) { rng_engine.seed(seed); }
     void clear_beam_search_info(uint64_t request_id);
-    std::vector<int32_t> get_beam_idxs(uint64_t request_id);
+    std::vector<int32_t> get_beam_idxs(SequenceGroup::CPtr sequence_group);
 };

 class Sampler::GroupBeamSearcher {
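
Why the signature changed, in isolation: with the whole sequence group available, the sampler's fallback (no beam-search entry for the request) can return one index per running sequence instead of a single zero. The toy types below only mimic the relevant pieces of SequenceGroup and Sampler for illustration; they are not the library classes.

#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

// Stand-in for the parts of SequenceGroup that get_beam_idxs now relies on.
struct FakeSequenceGroup {
    uint64_t request_id;
    size_t running_seqs;
    uint64_t get_request_id() const { return request_id; }
    size_t num_running_seqs() const { return running_seqs; }
};

// Stand-in for Sampler::m_beam_search_info (request_id -> beam indices).
std::map<uint64_t, std::vector<int32_t>> beam_search_info;

std::vector<int32_t> get_beam_idxs(const FakeSequenceGroup& group) {
    auto it = beam_search_info.find(group.get_request_id());
    if (it == beam_search_info.end()) {
        // Previously the fallback was "return { 0 };" regardless of how many
        // sequences were running; now it is sized from the group itself.
        return std::vector<int32_t>(group.num_running_seqs(), 0);
    }
    return it->second;
}

int main() {
    FakeSequenceGroup group{42, 3};  // request 42 with three running sequences
    for (int32_t idx : get_beam_idxs(group))
        std::cout << idx << ' ';     // prints: 0 0 0
    std::cout << '\n';
    return 0;
}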

src/cpp/src/visual_language/pipeline.cpp (45 changes: 18 additions & 27 deletions)
@@ -272,11 +272,6 @@ EncodedGenerationResult get_lm_encoded_results(

         SamplerOutput sampler_output = sampler.sample(requests, language.get_tensor("logits"));

-        // logits include image and prompt embedings
-        if (m_vlm_config.model_type == VLMModelType::LLAVA) {
-            request->update_processed_tokens_num(request->get_prompt_len());
-        }
-
         language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size});
         language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 });

@@ -326,13 +321,9 @@ EncodedGenerationResult get_lm_encoded_results(

         language.set_tensor("position_ids", position_ids);

-        std::vector<int32_t> beam_idxs = sampler.get_beam_idxs(request->get_request_id());
+        std::vector<int32_t> beam_idxs = sampler.get_beam_idxs(request);
         int32_t *beam_idx_data = beam_idx.data<int32_t>();
-        if (total_num_tokens > beam_idxs.size()) {
-            std::fill_n(beam_idx_data, total_num_tokens, 0);
-        } else {
-            copy(beam_idxs.begin(), beam_idxs.end(), beam_idx_data);
-        }
+        copy(beam_idxs.begin(), beam_idxs.end(), beam_idx_data);
         language.set_tensor("beam_idx", beam_idx);

         language.infer();
@@ -453,25 +444,25 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         const GenerationConfig& generation_config,
         const StreamerVariant& streamer
     ) {
-        // inputs_embeds, tokenized_input;
-        std::pair<ov::Tensor, ov::Tensor> processed_input;
+        ov::Tensor inputs_embeds;
         if (m_vlm_config.model_type == VLMModelType::MINICPM) {
-            processed_input = get_inputs_embeds_minicpm(prompt, rgbs);
+            inputs_embeds = get_inputs_embeds_minicpm(prompt, rgbs);
         } else if (m_vlm_config.model_type == VLMModelType::LLAVA) {
-            processed_input = get_inputs_embeds_llava(prompt, rgbs);
+            inputs_embeds = get_inputs_embeds_llava(prompt, rgbs);
         }

         Sampler sampler = Sampler(m_tokenizer);

         std::vector<SequenceGroup::Ptr> requests;
         size_t request_id = 0;
         size_t block_size = 1;
         bool enable_prefix_caching = false;
-        // now we have one prompt as input, so we need one request
-        SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, processed_input.second, generation_config, block_size, enable_prefix_caching);
-        size_t inputs_embeds_size = processed_input.first.get_shape()[1];
-        size_t tokenized_prompt_size = processed_input.second.get_size();
-        size_t num_processed_tokens = inputs_embeds_size <= tokenized_prompt_size ? tokenized_prompt_size - inputs_embeds_size : 0;
-        sequence_group->update_processed_tokens_num(num_processed_tokens);
+        size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1);
+        size_t inputs_embeds_size = inputs_embeds.get_shape().at(1);
+        ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size });
+
+        SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, prompt_ids, generation_config, block_size, enable_prefix_caching);
+        sequence_group->update_processed_tokens_num(history_size);
         sequence_group->set_sequence_group_ptr(sequence_group);
         requests.push_back(sequence_group);

@@ -492,7 +483,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         OPENVINO_ASSERT((generation_config.is_greedy_decoding() || generation_config.is_multinomial() || !streamer_ptr),
             "Currently streaming is possible only for greedy or multinomial decoding");

-        EncodedGenerationResult encoded_result = get_lm_encoded_results(m_language, m_embedding, processed_input.first, m_vlm_config, streamer_ptr, sampler, requests);
+        EncodedGenerationResult encoded_result = get_lm_encoded_results(m_language, m_embedding, inputs_embeds, m_vlm_config, streamer_ptr, sampler, requests);

         DecodedResults decoded;
         for (size_t idx = 0; idx < encoded_result.m_generation_ids.size(); ++idx) {
@@ -586,12 +577,12 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         m_generation_config = new_config;
     }

-    std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images) {
+    ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images) {
         std::string image_token = "<image>"; // TODO Consider getting from vlm_config or json
         std::string formatted_prompt = "USER: " + (images.empty() ? prompt : image_token + "\n" + prompt) + " ASSISTANT:";
         ov::Tensor input_ids = m_tokenizer.encode(formatted_prompt).input_ids;
         if (images.empty()) {
-            return std::pair{process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb), input_ids};
+            return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb);
         } else {
             OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed");
             EncodedImage encoded_image = m_vision_encoder.encode(images.at(0));
@@ -601,11 +592,11 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {

             int64_t image_token_index = 32000; // TODO Consider getting from m_vlm_config.image_token_index or config.json

-            return std::pair{merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_index), input_ids};
+            return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_index);
         }
     }

-    std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images) {
+    ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images) {
         std::string images_prompt;
         std::vector<EncodedImage> embeds;
         for (const ov::Tensor& rgb : images) {
@@ -736,7 +727,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
             }
         }

-        return std::pair{inputs_embeds, m_is_chat_conversation ? new_chat_tokens : encoded_input};
+        return inputs_embeds;
     }

     ov::Tensor resample(VLMPipeline::VLMPipelineImpl& pipe, const ov::Tensor& encoded_image, const std::vector<ImageSize>& target_sizes) {
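
A toy sketch of the bookkeeping that replaces the old processed-tokens arithmetic in generate(): prompt_ids now spans the existing chat history plus the freshly computed embeddings, and update_processed_tokens_num(history_size) marks the history part as already consumed, so only the new tokens are scheduled. The numbers below are invented for illustration.

#include <cstddef>
#include <iostream>

int main() {
    size_t history_size = 12;       // attention_mask length from previous turns (example value)
    size_t inputs_embeds_size = 5;  // embeddings produced for the current prompt (example value)

    size_t prompt_len = history_size + inputs_embeds_size;  // size of the prompt_ids tensor
    size_t processed = history_size;                        // update_processed_tokens_num(history_size)
    size_t to_schedule = prompt_len - processed;            // tokens fed to the language model this turn

    std::cout << "prompt_ids length: " << prompt_len << '\n'
              << "already processed: " << processed << '\n'
              << "scheduled this step: " << to_schedule << '\n';  // prints 5
    return 0;
}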
