Skip to content

Commit

Permalink
Enable LLaVa-1.5 in VLM Pipeline (#917)
Browse files Browse the repository at this point in the history
Ticket: CVS-153333
  • Loading branch information
andrei-kochin authored Oct 11, 2024
2 parents 6839175 + 8f1e347 commit dbb1f7c
Show file tree
Hide file tree
Showing 14 changed files with 440 additions and 162 deletions.
17 changes: 14 additions & 3 deletions .github/workflows/causal_lm_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -698,19 +698,30 @@ jobs:
source ./ov/setupvars.sh
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
cmake --build ./build/ --config Release --target visual_language_chat py_generate_pipeline -j
- name: Download and convert a model and an image
- name: Download and convert MiniCPM-V-2_6 model and an image
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg
- name: Run chat chat sample
- name: Run visual_language_chat sample - MiniCPM-V-2_6
run: >
source ./ov/setupvars.sh
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
<<< $'What is on the image?\nWhat is special on the image?'
- name: Download and convert LLaVa 1.5 model and an image
run: |
source ./ov/setupvars.sh
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/
wget https://llava-vl.github.io/static/images/monalisa.jpg
- name: Run visual_language_chat sample - LLaVa 1.5
run: >
source ./ov/setupvars.sh
&& timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_1_5_7b_ov/ monalisa.jpg
<<< $'Who drew this painting?\nWhen did the painter live?'
- name: Run python chat sample
run: |
Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,9 @@ Continuous batching functionality is used within OpenVINO Model Server (OVMS) to

# Install optimum-intel to be able to download, convert and optimize LLMs from Hugging Face
# Optimum is not required to run models, only to convert and compress
pip install optimum[openvino]
pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git

# (Optional) Install (TBD) to be able to download models from Model Scope
#pip install optimum[openvino]
```

## Performing text generation
Expand Down
3 changes: 2 additions & 1 deletion samples/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
optimum[openvino]==1.22.0
optimum-intel @ git+https://github.com/huggingface/optimum-intel.git
numpy<2.0.0; sys_platform == 'darwin'
einops==0.8.0 # For Qwen
transformers_stream_generator==0.0.5 # For Qwen
diffusers==0.30.3
Expand Down
8 changes: 8 additions & 0 deletions src/cpp/include/openvino/genai/processor_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,14 @@ class OPENVINO_GENAI_EXPORTS ProcessorConfig {
/// Applied after norm_mean.
/// llava calls it image_std.
std::array<float, 3> norm_std{1.0f, 1.0f, 1.0f};

// llava specific config params
std::array<float, 3> image_mean{0.0f, 0.0f, 0.0f};
std::array<float, 3> image_std{1.0f, 1.0f, 1.0f};
size_t crop_size_height = 336;
size_t crop_size_width = 336;
size_t size_shortest_edge = 336;

/// @brief Default constructor
ProcessorConfig() = default;
/// @brief Construct ProcessorConfig from values in json_path.
Expand Down
17 changes: 15 additions & 2 deletions src/cpp/include/openvino/genai/vision_encoder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "openvino/genai/processor_config.hpp"
#include <openvino/openvino.hpp>
#include "vlm_model_type.hpp"

namespace ov::genai {
/// @brief A pair describing image size.
Expand Down Expand Up @@ -41,8 +42,10 @@ struct EncodedImage {
/// ov::InferRequest and configured by ProcessorConfig.
class OPENVINO_GENAI_EXPORTS VisionEncoder {
public:
/// @brief An enum denoting model type.
VLMModelType model_type;
/// @brief A model for image encoding.
ov::InferRequest m_encoder;
ov::InferRequest m_vision_encoder;
/// @brief A config to follow.
ProcessorConfig m_processor_config;

Expand All @@ -52,7 +55,7 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
explicit VisionEncoder(
const ov::InferRequest& encoder,
const ProcessorConfig& processor_config=ProcessorConfig{}
) : m_encoder{encoder}, m_processor_config{processor_config} {}
) : m_vision_encoder{encoder}, m_processor_config{processor_config} {}

/// @brief Construct the encoder from model_dir.
/// @param model_dir A folder containing openvino_embedding.xml and
Expand All @@ -63,6 +66,7 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
/// @param core ov::Core to be used to compile the model.
explicit VisionEncoder(
const std::filesystem::path& model_dir,
const VLMModelType model_type,
const std::string& device="CPU",
const ov::AnyMap device_config={},
ov::Core core=ov::Core{}
Expand Down Expand Up @@ -117,5 +121,14 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder {
image, AnyMap{std::forward<Properties>(properties)...}
);
}

private:
EncodedImage encode_minicpm(
const ov::Tensor& image, const ProcessorConfig& config
);

EncodedImage encode_llava(
const ov::Tensor& image, const ProcessorConfig& config
);
};
}
3 changes: 3 additions & 0 deletions src/cpp/include/openvino/genai/vlm_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@
#include "openvino/genai/visibility.hpp"
#include <openvino/runtime/properties.hpp>
#include <filesystem>
#include "vlm_model_type.hpp"

namespace ov::genai {
/// @brief A Configuration class passed to VLMPipeline and used to
/// change VLMPipeline's behavior. Corresponds to config.json.
class OPENVINO_GENAI_EXPORTS VLMConfig {
public:
/// @brief An enum denoting model type.
VLMModelType model_type;
/// @brief A size of a single embedding returned by a resampler.
/// Used to initialize positional embeddings for resampler input.
size_t hidden_size = 2304;
Expand Down
31 changes: 31 additions & 0 deletions src/cpp/include/openvino/genai/vlm_model_type.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <string>
#include <unordered_map>

#include "openvino/genai/visibility.hpp"
#include <openvino/core/except.hpp>

namespace ov::genai {

/// @brief Enumerates the visual language model families that can be selected
/// by a model type string (see to_vlm_model_type()).
enum class OPENVINO_GENAI_EXPORTS VLMModelType {
/// Selected by the "minicpmv" model type string.
MINICPM,
/// Selected by the "llava" model type string.
LLAVA,
};

/// @brief Converts a model type string into the matching VLMModelType value.
/// @param value Model type name; "minicpmv" and "llava" are recognized.
/// @return The enumerator registered for @p value.
/// @throws ov::Exception (via OPENVINO_THROW) when @p value is not a recognized name.
inline VLMModelType to_vlm_model_type(const std::string& value) {
    // Built once on first use; maps the textual name to its enumerator.
    static const std::unordered_map<std::string, VLMModelType> known_model_types{
        {"minicpmv", VLMModelType::MINICPM},
        {"llava", VLMModelType::LLAVA},
    };

    const auto found = known_model_types.find(value);
    if (found == known_model_types.end()) {
        OPENVINO_THROW("Unsupported '", value, "' VLM model type");
    }
    return found->second;
}
}
3 changes: 3 additions & 0 deletions src/cpp/include/openvino/genai/vlm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline {
private:
class VLMPipelineImpl;
std::unique_ptr<VLMPipelineImpl> m_pimpl;

ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images);
ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images);
};

/*
Expand Down
18 changes: 17 additions & 1 deletion src/cpp/src/processor_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa
OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config");
nlohmann::json parsed = nlohmann::json::parse(stream);
using ov::genai::utils::read_json_param;
read_json_param(parsed, "patch_size", patch_size);
read_json_param(parsed, "patch_size", patch_size); // For llava - stored in config.json vision_config
read_json_param(parsed, "scale_resolution", scale_resolution);
read_json_param(parsed, "max_slice_nums", max_slice_nums);
if (parsed.contains("norm_mean")) {
Expand All @@ -19,4 +19,20 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa
if (parsed.contains("norm_std")) {
norm_std = parsed.at("norm_std").get<std::array<float, 3>>();
}

// Setting llava config params
if (parsed.contains("image_mean")) {
image_mean = parsed.at("image_mean").get<std::array<float, 3>>();
}
if (parsed.contains("image_std")) {
image_std = parsed.at("image_std").get<std::array<float, 3>>();
}

if (parsed.contains("crop_size")) {
crop_size_height = parsed.at("crop_size").at("height");
crop_size_width = parsed.at("crop_size").at("width");
}
if (parsed.contains("size")) {
size_shortest_edge = parsed.at("size").at("shortest_edge");
}
}
114 changes: 101 additions & 13 deletions src/cpp/src/vision_encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -362,29 +362,117 @@ ProcessorConfig from_any_map(
read_anymap_param(config_map, "norm_std", extracted_config.norm_std);
return extracted_config;
}


/// @brief Prepares an image for the LLaVA vision encoder: shortest-edge
/// resize, center crop, then mean/std normalization.
/// @param image Source image holding uint8_t data. Width is read from
/// shape[3] and height from shape[2]; the pixel data is indexed as three
/// interleaved channels per pixel.
/// @param config Supplies size_shortest_edge, crop_size_height,
/// crop_size_width, image_mean and image_std.
/// @return An f32 tensor of shape {1, 3, height, width} that owns a copy of
/// the normalized pixels.
/// @throws ov::Exception (via OPENVINO_ASSERT) if the image has a zero-sized
/// edge or the crop window does not fit into the resized image.
ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig& config) {
    // Both stages are always enabled; the flags only keep them individually switchable.
    constexpr bool do_resize = true;
    constexpr bool do_center_crop = true;

    // ov::Tensor to clip_image_u8
    clip_image_u8 input_image{
        static_cast<int>(image.get_shape().at(3)),
        static_cast<int>(image.get_shape().at(2)),
        {image.data<uint8_t>(), image.data<uint8_t>() + image.get_size()}
    };

    // Resize so that the shortest edge becomes size_shortest_edge.
    clip_image_u8 resized_image;
    if (do_resize) {
        const int target_size = static_cast<int>(config.size_shortest_edge);
        const int shortest_edge = std::min(input_image.nx, input_image.ny);
        OPENVINO_ASSERT(shortest_edge > 0, "Image passed to LLaVA preprocessing must have non-zero width and height");
        const float scale = static_cast<float>(target_size) / shortest_edge;
        int new_width = static_cast<int>(input_image.nx * scale);
        int new_height = static_cast<int>(input_image.ny * scale);
        // Float rounding followed by truncation can yield target_size - 1 for the
        // shortest edge, which would let the crop below read past the image.
        // Pin the shortest edge to exactly target_size.
        if (input_image.nx == shortest_edge) {
            new_width = target_size;
        }
        if (input_image.ny == shortest_edge) {
            new_height = target_size;
        }
        bicubic_resize(input_image, resized_image, new_width, new_height);
    } else {
        resized_image = input_image;
    }

    // Center crop
    clip_image_u8 cropped_image;
    if (do_center_crop) {
        const int crop_height = static_cast<int>(config.crop_size_height);
        const int crop_width = static_cast<int>(config.crop_size_width);
        // Fail loudly instead of reading outside resized_image.buf.
        OPENVINO_ASSERT(
            crop_width <= resized_image.nx && crop_height <= resized_image.ny,
            "Crop size ", crop_width, "x", crop_height,
            " exceeds resized image size ", resized_image.nx, "x", resized_image.ny
        );
        const int start_x = (resized_image.nx - crop_width) / 2;
        const int start_y = (resized_image.ny - crop_height) / 2;

        cropped_image.nx = crop_width;
        cropped_image.ny = crop_height;
        const size_t row_bytes = size_t(3) * crop_width;
        cropped_image.buf.resize(row_bytes * crop_height);

        // Each cropped row is a contiguous run of 3 * crop_width bytes in the source row.
        for (int y = 0; y < crop_height; ++y) {
            const size_t src_offset = (size_t(start_y + y) * resized_image.nx + start_x) * 3;
            const uint8_t* src_row = resized_image.buf.data() + src_offset;
            std::copy(src_row, src_row + row_bytes, cropped_image.buf.data() + size_t(y) * row_bytes);
        }
    } else {
        cropped_image = resized_image;
    }

    // Normalize
    clip_ctx ctx;
    std::copy(config.image_mean.begin(), config.image_mean.end(), ctx.image_mean);
    std::copy(config.image_std.begin(), config.image_std.end(), ctx.image_std);

    clip_image_f32 normalized_image = clip_image_preprocess(ctx, cropped_image);

    // Convert clip_image_f32 to ov::Tensor. The tensor must own its memory:
    // wrapping normalized_image.buf.data() would leave the returned tensor
    // pointing at a buffer that is destroyed when this function returns.
    ov::Tensor result(
        ov::element::f32,
        {1, 3, size_t(normalized_image.ny), size_t(normalized_image.nx)}
    );
    OPENVINO_ASSERT(
        normalized_image.buf.size() == result.get_size(),
        "Unexpected normalized image buffer size"
    );
    std::copy(normalized_image.buf.begin(), normalized_image.buf.end(), result.data<float>());

    return result;
}
}

VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap device_config, ov::Core core) :
VisionEncoder{
core.compile_model(
model_dir / "image_encoder.xml", device, device_config
).create_infer_request(),
ov::genai::utils::from_config_json_if_exists<ov::genai::ProcessorConfig>(
VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) :
model_type(model_type) {
if (model_type == VLMModelType::MINICPM) {
m_vision_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request();
} else if (model_type == VLMModelType::LLAVA) {
// Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel
m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
}
m_processor_config = ov::genai::utils::from_config_json_if_exists<ov::genai::ProcessorConfig>(
model_dir, "preprocessor_config.json"
)
} {}
);
}

/// @brief Encodes an image with the preprocessing path that matches model_type.
/// @param image Image to encode.
/// @param config Processor configuration forwarded to the model specific encoder.
/// @return The result of encode_minicpm() or encode_llava().
/// @throws ov::Exception (via OPENVINO_THROW) if model_type is not one of the
/// handled enumerators, instead of falling off the end of a non-void function.
EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfig& config) {
    switch (model_type) {
        case VLMModelType::MINICPM:
            return encode_minicpm(image, config);
        case VLMModelType::LLAVA:
            return encode_llava(image, config);
    }
    OPENVINO_THROW("Unsupported VLM model type in VisionEncoder::encode");
}

/// @brief Encodes an image using a processor configuration derived from an ov::AnyMap.
/// @param image Image to encode.
/// @param config_map Properties passed to from_any_map() together with
/// m_processor_config to build the effective configuration.
/// @return The result of the ProcessorConfig based encode() overload.
EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
    const ProcessorConfig effective_config = from_any_map(config_map, m_processor_config);
    return encode(image, effective_config);
}

/// @brief Encodes an image through the MiniCPM path.
/// Fills a clip_ctx with patch_size and image_size from m_processor_config and
/// with the normalization mean/std from @p config, then delegates to
/// llava_image_embed_make_with_bytes_slice().
/// @param image Image to encode.
/// @param config Supplies norm_mean, norm_std, max_slice_nums,
/// scale_resolution and patch_size.
/// @return The value returned by llava_image_embed_make_with_bytes_slice().
// NOTE(review): ctx_clip.patch_size is taken from m_processor_config while the
// patch_size argument of the call is taken from config — confirm this is intended.
EncodedImage VisionEncoder::encode_minicpm(const ov::Tensor& image, const ProcessorConfig& config) {
clip_ctx ctx_clip;
ctx_clip.patch_size = m_processor_config.patch_size;
ctx_clip.image_size = m_processor_config.image_size;
std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean);
std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std);
return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums);
return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_vision_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums);
}

EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) {
return encode(image, from_any_map(
config_map, m_processor_config
));
/// @brief Encodes an image through the LLaVA path.
/// Preprocesses the image with preprocess_image_llava(), feeds it to the
/// vision encoder as the "pixel_values" input and runs inference.
/// @param image Image to encode.
/// @param config Supplies the preprocessing parameters plus crop_size_height,
/// crop_size_width and patch_size used to compute the returned size.
/// @return The encoder output tensor paired with the crop size divided by
/// patch_size (height first, then width).
EncodedImage VisionEncoder::encode_llava(const ov::Tensor& image, const ProcessorConfig& config) {
    const ov::Tensor pixel_values = preprocess_image_llava(image, config);
    m_vision_encoder.set_tensor("pixel_values", pixel_values);
    m_vision_encoder.infer();

    // Crop dimensions expressed in patch_size units.
    const auto patches_in_height = config.crop_size_height / config.patch_size;
    const auto patches_in_width = config.crop_size_width / config.patch_size;

    return {m_vision_encoder.get_output_tensor(), ImageSize{patches_in_height, patches_in_width}};
}
1 change: 1 addition & 0 deletions src/cpp/src/vlm_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) {
OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config");
nlohmann::json parsed = nlohmann::json::parse(stream);
using ov::genai::utils::read_json_param;
model_type = to_vlm_model_type(parsed.at("model_type"));
read_json_param(parsed, "hidden_size", hidden_size);
read_json_param(parsed, "scale_emb", scale_emb);
read_json_param(parsed, "query_num", query_num);
Expand Down
Loading

0 comments on commit dbb1f7c

Please sign in to comment.