Skip to content

Commit

Permalink
fix comments
Browse files Browse the repository at this point in the history
  • Loading branch information
sbalandi committed Oct 16, 2024
1 parent 56a3eb2 commit 689fd3c
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 867 deletions.
14 changes: 10 additions & 4 deletions .github/workflows/causal_lm_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -704,11 +704,17 @@ jobs:
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg
- name: Run visual_language_chat sample - MiniCPM-V-2_6
mkdir cat_img
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat_img/cat.jpg
- name: Run visual_language_chat sample with file as input - MiniCPM-V-2_6
run: >
source ./ov/setupvars.sh
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat_img/cat.jpg
<<< $'What is on the image?\nWhat is special on the image?'
- name: Run visual_language_chat sample with dir as input - MiniCPM-V-2_6
run: >
source ./ov/setupvars.sh
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat_img
<<< $'What is on the image?\nWhat is special on the image?'
- name: Download and convert LLaVa 1.5 model and an image
run: |
Expand All @@ -729,7 +735,7 @@ jobs:
source ./ov/setupvars.sh
export PYTHONPATH=./build/:$PYTHONPATH
printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt
timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt
timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat_img/cat.jpg < input.txt > ./pred.txt
cpp-continuous-batching-ubuntu:
runs-on: ubuntu-20.04-8-cores
Expand Down
22 changes: 22 additions & 0 deletions samples/cpp/visual_language_chat/load_image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,28 @@
#include "stb_image.h"
#include "load_image.hpp"

#include <algorithm>

namespace fs = std::filesystem;

std::vector<ov::Tensor> utils::load_images(const std::filesystem::path& input_path) {
std::vector<ov::Tensor> images;
if (!input_path.empty() && fs::exists(input_path)) {
if (fs::is_directory(input_path)) {
for (const auto& dir_entry : fs::directory_iterator(input_path)) {
ov::Tensor image = utils::load_image(dir_entry.path());
images.push_back(std::move(image));
}
} else if (fs::is_regular_file(input_path)) {
ov::Tensor image = utils::load_image(input_path);
images.push_back(std::move(image));
}
}

if (images.empty())
throw std::runtime_error(std::string{"No one image found by path "} + input_path.string());

return images;
}

ov::Tensor utils::load_image(const std::filesystem::path& image_path) {
int x = 0, y = 0, channels_in_file = 0;
constexpr int desired_channels = 3;
Expand Down
1 change: 1 addition & 0 deletions samples/cpp/visual_language_chat/load_image.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@

namespace utils {
ov::Tensor load_image(const std::filesystem::path& image_path);
std::vector<ov::Tensor> load_images(const std::filesystem::path& image_path);
}
52 changes: 9 additions & 43 deletions samples/cpp/visual_language_chat/visual_language_chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,41 +14,10 @@ bool print_subword(std::string&& subword) {

int main(int argc, char* argv[]) try {
if (3 != argc) {
throw std::runtime_error(std::string{"Usage "} + argv[0] + " <MODEL_DIR> <IMAGE_FILE>");
throw std::runtime_error(std::string{"Usage "} + argv[0] + " <MODEL_DIR> <IMAGE_FILE OR DIR_WITH_IMAGES>");
}

// multinomial or beam_search can be used as well
ov::genai::GenerationConfig generation_config = ov::genai::greedy();
// ov::genai::GenerationConfig generation_config = ov::genai::multinomial();
// ov::genai::GenerationConfig generation_config = ov::genai::beam_search();

ov::AnyMap properies;
properies.insert(ov::genai::generation_config(generation_config));

// streamer could be used with greedy and multinomial
// if num_return_sequences > 1 in case of multinomial, the streamer will use the output from the first sequence
if (generation_config.is_greedy_decoding() or generation_config.is_multinomial()) {
properies.insert(ov::genai::streamer(print_subword));
}

std::vector<ov::Tensor> images;
std::string input_path = argv[2];
if (!input_path.empty() && fs::exists(input_path)) {
if (fs::is_directory(input_path)) {
for (const auto& dir_entry : fs::directory_iterator(input_path)) {
ov::Tensor image = utils::load_image(dir_entry.path());
images.push_back(std::move(image));
}
} else if (fs::is_regular_file(input_path)) {
ov::Tensor image = utils::load_image(input_path);
images.push_back(std::move(image));
}
}

if (images.empty())
throw std::runtime_error("No one image found by path " + input_path);
else
properies.insert(images.size() == 1 ? ov::genai::image(images.at(0)) : ov::genai::images(images));
std::vector<ov::Tensor> images = utils::load_images(argv[2]);

std::string device = "CPU"; // GPU can be used as well
ov::AnyMap enable_compile_cache;
Expand All @@ -64,19 +33,16 @@ int main(int argc, char* argv[]) try {
std::cout << "question:\n";

std::getline(std::cin, prompt);
auto resuls = pipe.generate(prompt, properies);
if (generation_config.is_beam_search()) {
std::cout << resuls.texts.at(0) << std::endl;
}
properies.erase(images.size() == 1 ? "image" : "images");

auto resuls = pipe.generate(prompt,
ov::genai::images(images),
ov::genai::generation_config(ov::genai::greedy()),
ov::genai::streamer(print_subword));
std::cout << "\n----------\n"
"question:\n";
while (std::getline(std::cin, prompt)) {
resuls = pipe.generate(prompt, properies);
if (generation_config.is_beam_search()) {
std::cout << resuls.texts.at(0) << std::endl;
}
resuls = pipe.generate(prompt,
ov::genai::generation_config(ov::genai::greedy()),
ov::genai::streamer(print_subword));
std::cout << "\n----------\n"
"question:\n";
}
Expand Down
90 changes: 19 additions & 71 deletions src/cpp/src/visual_language/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#include "openvino/genai/visual_language/pipeline.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "vlm_sampling.hpp"
#include "sampler.hpp"
#include "clip.hpp"
#include "text_callback_streamer.hpp"
#include "utils.hpp"
Expand All @@ -21,64 +21,6 @@ template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>;

constexpr size_t BATCH_SIZE = 1;

// Sampling options consumed by get_out_token_id().
struct Args {
    // false: pick the highest-scoring token; true: draw a token at random from the filtered scores.
    bool do_sample{false};
    // Keep only the top_k best candidates; ignored unless 0 < top_k < number of candidates.
    int top_k{0};
    // Top-p cut-off; ignored unless 0 < top_p < 1.
    float top_p{0.7f};
    // Temperature passed to sampling_temperature(); applied only when sampling and temp > 0.
    float temp{0.95f};
    // Repetition penalty; exactly 1 disables the penalty step.
    float repeat_penalty{1.0f};
};

int64_t get_out_token_id(const std::vector<int>& input_ids, float* logits, size_t vocab_size, Args args) {
int64_t out_token;

// logits pre-process
if (args.repeat_penalty != 1.f) {
sampling_repetition_penalty(logits, logits + vocab_size, input_ids, args.repeat_penalty);
}

if (args.do_sample)
{
if (args.temp > 0) {
sampling_temperature(logits, logits + vocab_size, args.temp);
}

std::vector<TokenIdScore> token_scores(vocab_size);
for (int i = 0; i < vocab_size; i++) {
token_scores[i] = TokenIdScore(i, logits[i]);
}

// top_k sampling
if (0 < args.top_k && args.top_k < (int)token_scores.size()) {
sampling_top_k(token_scores.data(), token_scores.data() + args.top_k,
token_scores.data() + token_scores.size());
token_scores.resize(args.top_k);
}

// top_p sampling
if (0.f < args.top_p && args.top_p < 1.f) {
auto pos = sampling_top_p(token_scores.data(), token_scores.data() + token_scores.size(), args.top_p);
token_scores.resize(pos - token_scores.data());
}

// sample next token
sampling_softmax_inplace(token_scores.data(), token_scores.data() + token_scores.size());
for (size_t i = 0; i < token_scores.size(); i++) {
logits[i] = token_scores[i].score;
}

thread_local std::random_device rd;
thread_local std::mt19937 gen(rd());

std::discrete_distribution<> dist(logits, logits + token_scores.size());
out_token = token_scores[dist(gen)].id;
}
else {
out_token = std::max_element(logits, logits + vocab_size) - logits;
}

return out_token;
}

ov::Tensor process_prompt(ov::InferRequest& embedding, const ov::Tensor& prompt, float scale_emb) {
embedding.set_input_tensor(prompt);
Expand Down Expand Up @@ -302,9 +244,9 @@ ov::Tensor merge_text_and_image_embeddings_llava(
EncodedGenerationResult get_lm_encoded_results(
ov::InferRequest& language,
ov::InferRequest& embedding,
ov::Tensor inputs_embeds,
const VLMConfig m_vlm_config,
const std::shared_ptr<StreamerBase> streamer_ptr,
const ov::Tensor& inputs_embeds,
const VLMConfig& m_vlm_config,
const std::shared_ptr<StreamerBase>& streamer_ptr,
Sampler& sampler,
std::vector<SequenceGroup::Ptr> requests
) {
Expand Down Expand Up @@ -413,7 +355,7 @@ EncodedGenerationResult get_lm_encoded_results(
EncodedGenerationResult result;
result.m_request_id = 1;
std::vector<GenerationOutput> generation_outputs = generation->read_all();
std::sort(generation_outputs.begin(), generation_outputs.end(), [=] (GenerationOutput& r1, GenerationOutput& r2) {
std::sort(generation_outputs.begin(), generation_outputs.end(), [] (const GenerationOutput& r1, const GenerationOutput& r2) {
return r1.score > r2.score;
});

Expand Down Expand Up @@ -521,9 +463,11 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {

Sampler sampler = Sampler(m_tokenizer);
std::vector<SequenceGroup::Ptr> requests;
// request_id, input_ids, generation_config, block_size, enable_prefix_caching
size_t request_id = 0;
size_t block_size = 1;
bool enable_prefix_caching = false;
// now we have one prompt as input, so we need one request
SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(0, processed_input.second, generation_config, 1, false);
SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, processed_input.second, generation_config, block_size, enable_prefix_caching);
size_t inputs_embeds_size = processed_input.first.get_shape()[1];
size_t tokenized_prompt_size = processed_input.second.get_size();
size_t num_processed_tokens = inputs_embeds_size <= tokenized_prompt_size ? tokenized_prompt_size - inputs_embeds_size : 0;
Expand All @@ -545,9 +489,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
},
}, streamer);

if ((!(generation_config.is_greedy_decoding() || generation_config.is_multinomial())) && streamer_ptr) {
OPENVINO_THROW("Currently streaming is possible only for greedy or multinomial decoding");
}
OPENVINO_ASSERT((generation_config.is_greedy_decoding() || generation_config.is_multinomial() || !streamer_ptr),
"Currently streaming is possible only for greedy or multinomial decoding");

EncodedGenerationResult encoded_result = get_lm_encoded_results(m_language, m_embedding, processed_input.first, m_vlm_config, streamer_ptr, sampler, requests);

Expand Down Expand Up @@ -591,6 +534,11 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map);
GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config();
config.update_generation_config(config_map);

// If eos_token_id was not provided, take the value from the tokenizer
if (config.eos_token_id == -1)
config.set_eos_token_id(m_tokenizer.get_eos_token_id());

return generate(
prompt,
rgbs,
Expand Down Expand Up @@ -638,12 +586,12 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
m_generation_config = new_config;
}

ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images) {
std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images) {
std::string image_token = "<image>"; // TODO Consider getting from vlm_config or json
std::string formatted_prompt = "USER: " + (images.empty() ? prompt : image_token + "\n" + prompt) + " ASSISTANT:";
ov::Tensor input_ids = m_tokenizer.encode(formatted_prompt).input_ids;
if (images.empty()) {
return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb);
return std::pair{process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb), input_ids};
} else {
OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed");
EncodedImage encoded_image = m_vision_encoder.encode(images.at(0));
Expand All @@ -657,7 +605,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
}
}

ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images) {
std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images) {
std::string images_prompt;
std::vector<EncodedImage> embeds;
for (const ov::Tensor& rgb : images) {
Expand Down
Loading

0 comments on commit 689fd3c

Please sign in to comment.