Enable LLaVa-1.5 in VLM Pipeline #917

Merged 20 commits on Oct 11, 2024
17 changes: 14 additions & 3 deletions .github/workflows/causal_lm_cpp.yml
@@ -698,19 +698,30 @@ jobs:
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release --target visual_language_chat py_generate_pipeline -j
- name: Download and convert a model and an image
- name: Download and convert MiniCPM-V-2_6 model and an image
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg

- name: Run chat chat sample
- name: Run visual_language_chat sample - MiniCPM-V-2_6
run: >
source ./ov/setupvars.sh
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
<<< $'What is on the image?\nWhat is special on the image?'
- name: Download and convert LLaVa 1.5 model and an image
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/
wget https://llava-vl.github.io/static/images/monalisa.jpg
- name: Run visual_language_chat sample - LLaVa 1.5
run: >
source ./ov/setupvars.sh
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_1_5_7b_ov/ monalisa.jpg
<<< $'Who drew this painting?\nWhen did the painter live?'

- name: Run python chat sample
run: |
3 changes: 1 addition & 2 deletions README.md
@@ -40,10 +40,9 @@ Continuous batching functionality is used within OpenVINO Model Server (OVMS) to

# Install optimum-intel to be able to download, convert and optimize LLMs from Hugging Face
# Optimum is not required to run models, only to convert and compress
pip install optimum[openvino]
pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git

# (Optional) Install (TBD) to be able to download models from Model Scope
#pip install optimum[openvino]
```

## Performing text generation
3 changes: 2 additions & 1 deletion samples/requirements.txt
@@ -1,5 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
optimum[openvino]==1.22.0
optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
numpy<2.0.0; sys_platform == 'darwin'
einops==0.8.0 # For Qwen
transformers_stream_generator==0.0.5 # For Qwen
diffusers==0.30.3
8 changes: 8 additions & 0 deletions src/cpp/include/openvino/genai/processor_config.hpp
@@ -34,6 +34,14 @@ class OPENVINO_GENAI_EXPORTS ProcessorConfig {
/// Applied after norm_mean.
/// llava calls it image_std.
std::array<float, 3> norm_std{1.0f, 1.0f, 1.0f};

// llava specific config params
std::array<float, 3> image_mean{0.0f, 0.0f, 0.0f};
std::array<float, 3> image_std{1.0f, 1.0f, 1.0f};
size_t crop_size_height = 336;
size_t crop_size_width = 336;
size_t size_shortest_edge = 336;

/// @brief Default constructor
ProcessorConfig() = default;
/// @brief Construct ProcessorConfig from values in json_path.
17 changes: 15 additions & 2 deletions src/cpp/include/openvino/genai/vision_encoder.hpp
@@ -5,6 +5,7 @@

#include "openvino/genai/processor_config.hpp"
#include <openvino/openvino.hpp>
#include "vlm_model_type.hpp"

namespace ov::genai {
/// @brief A pair describing image size.
@@ -41,8 +41,10 @@ struct EncodedImage {
/// ov::InferRequest and configured by ProcessorConfig.
class OPENVINO_GENAI_EXPORTS VisionEncoder {
public:
/// @brief An enum denoting model type.
VLMModelType model_type;
/// @brief A model for image encoding.
ov::InferRequest m_encoder;
ov::InferRequest m_vision_encoder;
/// @brief A config to follow.
ProcessorConfig m_processor_config;

@@ -52,7 +55,7 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
explicit VisionEncoder(
const ov::InferRequest& encoder,
const ProcessorConfig& processor_config=ProcessorConfig{}
) : m_encoder{encoder}, m_processor_config{processor_config} {}
) : m_vision_encoder{encoder}, m_processor_config{processor_config} {}

/// @brief Construct the encoder from model_dir.
/// @param model_dir A folder containing openvino_embedding.xml and
@@ -63,6 +66,7 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
/// @param core ov::Core to be used to compile the model.
explicit VisionEncoder(
const std::filesystem::path& model_dir,
const VLMModelType model_type,
const std::string& device="CPU",
const ov::AnyMap device_config={},
ov::Core core=ov::Core{}
@@ -117,5 +121,14 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
image, AnyMap{std::forward<Properties>(properties)...}
);
}

private:
EncodedImage encode_minicpm(
const ov::Tensor& image, const ProcessorConfig& config
);

EncodedImage encode_llava(
const ov::Tensor& image, const ProcessorConfig& config
);
};
}
3 changes: 3 additions & 0 deletions src/cpp/include/openvino/genai/vlm_config.hpp
@@ -6,12 +6,15 @@
#include "openvino/genai/visibility.hpp"
#include <openvino/runtime/properties.hpp>
#include <filesystem>
#include "vlm_model_type.hpp"

namespace ov::genai {
/// @brief A Configuration class passed to VLMPipeline and used to
/// change VLMPipeline's behavior. Corresponds to config.json.
class OPENVINO_GENAI_EXPORTS VLMConfig {
public:
/// @brief An enum denoting model type.
VLMModelType model_type;
/// @brief A size of a single embedding returned by a resampler.
/// Used to initialize positional embeddings for resampler input.
size_t hidden_size = 2304;
31 changes: 31 additions & 0 deletions src/cpp/include/openvino/genai/vlm_model_type.hpp
@@ -0,0 +1,31 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <string>
#include <unordered_map>

#include "openvino/genai/visibility.hpp"
#include <openvino/core/except.hpp>

namespace ov::genai {

enum class OPENVINO_GENAI_EXPORTS VLMModelType {
MINICPM,
LLAVA,
};

inline VLMModelType to_vlm_model_type(const std::string& value) {
static const std::unordered_map<std::string, VLMModelType> model_types_map = {
{"minicpmv", VLMModelType::MINICPM},
{"llava", VLMModelType::LLAVA}
};

auto it = model_types_map.find(value);
if (it != model_types_map.end()) {
return it->second;
}
OPENVINO_THROW("Unsupported '", value, "' VLM model type");
}
}
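
For orientation, a minimal usage sketch of the mapping above (not part of this diff). It assumes the new header is on the include path and that "llava" / "minicpmv" are the `model_type` strings found in config.json of the respective exports, as the map suggests.

```cpp
// Minimal, illustrative use of to_vlm_model_type; assumes the header above is available.
#include <iostream>
#include "openvino/genai/vlm_model_type.hpp"

int main() {
    // "llava" is the model_type string written to config.json for LLaVa-1.5 exports.
    ov::genai::VLMModelType type = ov::genai::to_vlm_model_type("llava");
    if (type == ov::genai::VLMModelType::LLAVA) {
        std::cout << "LLaVa branch selected\n";
    }
    // Any other string, e.g. "qwen2_vl", would hit OPENVINO_THROW above.
    return 0;
}
```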
3 changes: 3 additions & 0 deletions src/cpp/include/openvino/genai/vlm_pipeline.hpp
@@ -139,6 +139,9 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
private:
class VLMPipelineImpl;
std::unique_ptr<VLMPipelineImpl> m_pimpl;

ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images);
ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images);
};

/*
18 changes: 17 additions & 1 deletion src/cpp/src/processor_config.cpp
@@ -10,7 +10,7 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa
OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config");
nlohmann::json parsed = nlohmann::json::parse(stream);
using ov::genai::utils::read_json_param;
read_json_param(parsed, "patch_size", patch_size);
read_json_param(parsed, "patch_size", patch_size); // For llava - stored in config.json vision_config
read_json_param(parsed, "scale_resolution", scale_resolution);
read_json_param(parsed, "max_slice_nums", max_slice_nums);
if (parsed.contains("norm_mean")) {
@@ -19,4 +19,20 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa
if (parsed.contains("norm_std")) {
norm_std = parsed.at("norm_std").get<std::array<float, 3>>();
}

// Setting llava config params
if (parsed.contains("image_mean")) {
image_mean = parsed.at("image_mean").get<std::array<float, 3>>();
}
if (parsed.contains("image_std")) {
image_std = parsed.at("image_std").get<std::array<float, 3>>();
}

if (parsed.contains("crop_size")) {
crop_size_height = parsed.at("crop_size").at("height");
crop_size_width = parsed.at("crop_size").at("width");
}
if (parsed.contains("size")) {
size_shortest_edge = parsed.at("size").at("shortest_edge");
}
}
114 changes: 101 additions & 13 deletions src/cpp/src/vision_encoder.cpp
@@ -362,29 +362,117 @@ ProcessorConfig from_any_map(
read_anymap_param(config_map, "norm_std", extracted_config.norm_std);
return extracted_config;
}


ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig& config) {
bool do_resize = true;
bool do_center_crop = true;

// ov::Tensor to clip_image_u8
clip_image_u8 input_image{
int(image.get_shape().at(3)),
int(image.get_shape().at(2)),
{image.data<uint8_t>(), image.data<uint8_t>() + image.get_size()}
};

// Resize
clip_image_u8 resized_image;
if (do_resize) {
int target_size = config.size_shortest_edge;
float scale = static_cast<float>(target_size) / std::min(input_image.nx, input_image.ny);
int new_width = static_cast<int>(input_image.nx * scale);
int new_height = static_cast<int>(input_image.ny * scale);
bicubic_resize(input_image, resized_image, new_width, new_height);
} else {
resized_image = input_image;
}

// Center crop
clip_image_u8 cropped_image;
if (do_center_crop) {
int crop_height = config.crop_size_height;
int crop_width = config.crop_size_width;
int start_x = (resized_image.nx - crop_width) / 2;
int start_y = (resized_image.ny - crop_height) / 2;

cropped_image.nx = crop_width;
cropped_image.ny = crop_height;
cropped_image.buf.resize(3 * crop_width * crop_height);

for (int y = 0; y < crop_height; ++y) {
for (int x = 0; x < crop_width; ++x) {
for (int c = 0; c < 3; ++c) {
cropped_image.buf[(y * crop_width + x) * 3 + c] =
resized_image.buf[((start_y + y) * resized_image.nx + (start_x + x)) * 3 + c];
}
}
}
} else {
cropped_image = resized_image;
}

// Normalize
clip_ctx ctx;
std::copy(config.image_mean.begin(), config.image_mean.end(), ctx.image_mean);
std::copy(config.image_std.begin(), config.image_std.end(), ctx.image_std);

clip_image_f32 normalized_image = clip_image_preprocess(ctx, cropped_image);

// Convert clip_image_f32 to ov::Tensor; copy the data so the tensor owns it,
// since normalized_image.buf is a local buffer destroyed when this function returns
ov::Tensor result(
ov::element::f32,
{1, 3, size_t(normalized_image.ny), size_t(normalized_image.nx)}
);
std::copy(normalized_image.buf.begin(), normalized_image.buf.end(), result.data<float>());

return result;
}
}

VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap device_config, ov::Core core) :
VisionEncoder{
core.compile_model(
model_dir / "image_encoder.xml", device, device_config
).create_infer_request(),
ov::genai::utils::from_config_json_if_exists<ov::genai::ProcessorConfig>(
VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) :
model_type(model_type) {
if (model_type == VLMModelType::MINICPM) {
m_vision_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request();
} else if (model_type == VLMModelType::LLAVA) {
// Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel
m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
}
m_processor_config = ov::genai::utils::from_config_json_if_exists<ov::genai::ProcessorConfig>(
model_dir, "preprocessor_config.json"
)
} {}
);
}

EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfig& config) {
if (model_type == VLMModelType::MINICPM) {
return encode_minicpm(image, config);
} else if (model_type == VLMModelType::LLAVA) {
return encode_llava(image, config);
}
OPENVINO_THROW("Unsupported model type");
}

EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
return encode(image, from_any_map(
config_map, m_processor_config
));
}

EncodedImage VisionEncoder::encode_minicpm(const ov::Tensor& image, const ProcessorConfig& config) {
clip_ctx ctx_clip;
ctx_clip.patch_size = m_processor_config.patch_size;
ctx_clip.image_size = m_processor_config.image_size;
std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean);
std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std);
return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums);
return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_vision_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums);
}

EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
return encode(image, from_any_map(
config_map, m_processor_config
));
EncodedImage VisionEncoder::encode_llava(const ov::Tensor& image, const ProcessorConfig& config) {
ov::Tensor preprocessed_image = preprocess_image_llava(image, config);

m_vision_encoder.set_tensor("pixel_values", preprocessed_image);
m_vision_encoder.infer();

ov::Tensor image_features = m_vision_encoder.get_output_tensor();
ImageSize resized_source_size{config.crop_size_height / config.patch_size, config.crop_size_width / config.patch_size};

return {image_features, resized_source_size};
}
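
As a rough, hedged sketch (not part of this diff) of how the new LLaVa path might be driven end to end: construct the encoder from an optimum-cli export directory and encode an image. The model directory mirrors the CI step above; the {1, 3, height, width} u8 layout is an assumption inferred from how preprocess_image_llava reads width and height, and the zero-filled tensor stands in for a real decoded image.

```cpp
// Hedged usage sketch of the LLaVa vision-encoding path; assumes a model exported with
//   optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/
#include "openvino/genai/vision_encoder.hpp"

int main() {
    using namespace ov::genai;

    // Compiles openvino_vision_embeddings_model.xml (vision tower merged with the
    // multimodal projector) and reads preprocessor_config.json from the export dir.
    VisionEncoder encoder("./llava_1_5_7b_ov/", VLMModelType::LLAVA, "CPU");

    // Placeholder image: {1, 3, height, width}, u8 (layout assumed from preprocess_image_llava).
    ov::Tensor image(ov::element::u8, {1, 3, 480, 640});

    // Shortest-edge resize to 336, 336x336 center crop, normalization, then a single infer().
    EncodedImage encoded = encoder.encode(image, encoder.m_processor_config);

    // encoded holds the projected image features plus the crop size expressed in patches.
    return 0;
}
```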
1 change: 1 addition & 0 deletions src/cpp/src/vlm_config.cpp
@@ -10,6 +10,7 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) {
OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config");
nlohmann::json parsed = nlohmann::json::parse(stream);
using ov::genai::utils::read_json_param;
model_type = to_vlm_model_type(parsed.at("model_type"));
read_json_param(parsed, "hidden_size", hidden_size);
read_json_param(parsed, "scale_emb", scale_emb);
read_json_param(parsed, "query_num", query_num);