diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 097cca34dd..5bada5da5b 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -703,18 +703,18 @@ jobs:
           source ./ov/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-          python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
+          python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
+          python -m pip install -U "optimum<1.23" --no-dependencies
+          optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code
           wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg
       - name: Run visual_language_chat sample - MiniCPM-V-2_6
         run: >
           source ./ov/setupvars.sh
-          && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
+          && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./MiniCPM-V-2_6/ cat.jpg
           <<< $'What is on the image?\nWhat is special on the image?'
       - name: Download and convert LLaVa 1.5 model and an image
         run: |
           source ./ov/setupvars.sh
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/
           wget https://llava-vl.github.io/static/images/monalisa.jpg
       - name: Run visual_language_chat sample - LLaVa 1.5
@@ -729,7 +729,7 @@ jobs:
           source ./ov/setupvars.sh
           export PYTHONPATH=./build/:$PYTHONPATH
           printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt
-          timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt
+          timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./MiniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt
 
   cpp-continuous-batching-ubuntu:
     runs-on: ubuntu-20.04-8-cores
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index a3d44f28c1..bc56996a1f 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -268,7 +268,13 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py
+          python -m pytest ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py
+
+      - run: python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
+      - run: python -m pip install -U "optimum<1.23" --no-dependencies
+      - run: >
+          source ${OV_INSTALL_DIR}/setupvars.sh
+          && python -m pytest ./tests/python_tests/test_vlm_api.py
 
   genai_python_lib_whisper:
     name: OpenVINO genai extension whisper tests (cmake + wheel)
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index e803bae787..d40aa02710 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -366,6 +366,8 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
+          python -m pip install -U "optimum<1.23" --no-dependencies
           python -m pytest ./tests/python_tests/test_vlm_api.py
         env:
           PYTHONPATH: "./build/"  # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md
index b9d0ebcfe4..99ba417baf 100644
--- a/samples/cpp/visual_language_chat/README.md
+++ b/samples/cpp/visual_language_chat/README.md
@@ -10,7 +10,7 @@ It's not required to install [../../requirements.txt](../../requirements.txt) fo
 ```sh
 pip install --upgrade-strategy eager -r ../../requirements.txt
-export_MiniCPM-V-2_6.py miniCPM-V-2_6
+optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code MiniCPM-V-2_6
 ```
 
 ## Run
diff --git a/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py b/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py
deleted file mode 100644
index 903979b795..0000000000
--- a/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py
+++ /dev/null
@@ -1,1199 +0,0 @@
-import argparse
-import requests
-import torch
-from threading import Thread
-from copy import deepcopy
-import shutil
-import json
-from PIL import Image
-from transformers import AutoModel, AutoTokenizer, AutoProcessor, TextIteratorStreamer
-from transformers.generation import GenerationMixin
-from transformers import AutoConfig, GenerationConfig
-from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPooling
-from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
-from pathlib import Path
-from huggingface_hub import snapshot_download
-import types
-from typing import Optional, Tuple, List, Union
-from openvino.runtime import opset13
-import openvino as ov
-import openvino_tokenizers
-import numpy as np
-import gc
-from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher
-import time
-
-text_emb_path = Path("embed_tokens.xml")
-image_emb_path = Path("image_encoder.xml")
-resampler_path = Path("resampler.xml")
-llm_path = Path("language_model.xml")
-
-class InsertSlice(MatcherPass):
-    def __init__(self):
-        MatcherPass.__init__(self)
-        self.model_changed = False
-
-        param = WrapType("opset10.Result")
-
-        def callback(matcher: Matcher) -> bool:
-            root = matcher.get_match_root()
-            if root is None:
- return False - if len(root.get_output_partial_shape(0)) == 3: - parent = root.input_value(0).get_node() - grand_parent = parent.input_value(0).get_node() - - grand_parent_output = parent.input(0).get_source_output() - consumers = grand_parent_output.get_target_inputs() - start = np.array([0, -1, 0], dtype=np.int32) - stop = np.array([1, -2, grand_parent_output.get_partial_shape()[-1].get_length()], dtype=np.int32) - step = np.array([1, -1, 1], dtype=np.int32) - axes = np.array([0, 1, 2], dtype=np.int32) - slice = opset13.slice(grand_parent, start, stop, step, axes, name="inserted_slice") - for consumer in consumers: - consumer.replace_source_output(slice.output(0)) - self.model_changed = True - # Use new operation for additional matching - self.register_new_node(slice) - print("applied slice for lm head") - - return True - - self.register_matcher(Matcher(param, "InsertSlice"), callback) - - -def model_has_state(ov_model: ov.Model): - return len(ov_model.get_sinks()) > 0 - - -def model_has_input_output_name(ov_model: ov.Model, name: str): - """ - Helper function for checking that model has specified input or output name - - Parameters: - ov_model (ov.Model): - name (str): - name of input or output - - Returns: - True if input or output with requested name exists else False - """ - return name in sum([list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], []) - - -def fuse_cache_reorder( - ov_model: ov.Model, - not_kv_inputs: List[str], - key_value_input_names: List[str], - gather_dim: int, -): - """ - Fuses reored_cache during generate cycle into ov.Model. Used with stateful models, because we can not modify model state directly. - - Adds a new beam_idx parameter and Gather op per each kv-cache input in a given model. - Should be run before make_stateful. Implements optimumum's _reorder_cache - inside the model in the beginning of each iteration. - Gather works along given gather_dim dimension that may vary from model to model. - KV-cache inputs are identified based on names in key_value_input_names. - Append the new beam_idx parameter to not_kv_inputs. 
- - Parameters: - ov_model (`ov.Model`): - openvino model for processing - not_kv_inputs (`List[str]`): - list of input nodes in model that not related to past key values - key_value_input_names (`List[str]`): - list of names for key value input layers - gather_dim (int): - dimension for gathering cache during reorder pass - """ - - if model_has_input_output_name(ov_model, "beam_idx"): - raise ValueError("Model already has fused cache") - input_batch = ov_model.input("inputs_embeds").get_partial_shape()[0] - beam_idx = opset13.parameter(name="beam_idx", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch])) - beam_idx.output(0).get_tensor().add_names({"beam_idx"}) - ov_model.add_parameters([beam_idx]) - not_kv_inputs.append(ov_model.inputs[-1]) - # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx - for input_name in key_value_input_names: - parameter_output_port = ov_model.input(input_name) - consumers = parameter_output_port.get_target_inputs() - gather = opset13.gather(parameter_output_port, beam_idx, opset13.constant(gather_dim)) - for consumer in consumers: - consumer.replace_source_output(gather.output(0)) - ov_model.validate_nodes_and_infer_types() - - -def build_state_initializer(ov_model: ov.Model, batch_dim: int): - """ - Build initialization ShapeOf Expression for all ReadValue ops - - Parameters: - ov_model (ov.Model): - openvino model - batch_dim (int): - index of dimension corresponding to batch size - """ - input_ids = ov_model.input("inputs_embeds") - batch = opset13.gather( - opset13.shape_of(input_ids, output_type="i64"), - opset13.constant([0]), - opset13.constant(0), - ) - for op in ov_model.get_ops(): - if op.get_type_name() == "ReadValue": - dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))] - dims[batch_dim] = batch - dims = [(opset13.constant(np.array([dim], dtype=np.int64)) if isinstance(dim, int) else dim) for dim in dims] - shape = opset13.concat(dims, axis=0) - broadcast = opset13.broadcast(opset13.constant(0.0, dtype=op.get_output_element_type(0)), shape) - op.set_arguments([broadcast]) - ov_model.validate_nodes_and_infer_types() - - -def make_stateful( - ov_model: ov.Model, - not_kv_inputs: List[str], - key_value_input_names: List[str], - key_value_output_names: List[str], - batch_dim: int, - num_attention_heads: int, - num_beams_and_batch: int = None, -): - """ - Hides kv-cache inputs and outputs inside the model as variables. 
- - Parameters: - ov_model (ov.Model): - openvino model - not_kv_inputs (`List[str]`): - list of input nodes in model that not related to past key values - key_value_input_names (`List[str]`): - list of names for key value input layers - key_value_output_names (`List[str]`): - list of names for key value input layers - batch_dim (int): - index of batch dimension in key value layers - num_attention_heads (int): - number of attention heads for batch dimension initialization - num_beams_an_batch (int): - precalculated number of beams and batch for shapes initialization - """ - from openvino._offline_transformations import apply_make_stateful_transformation - - input_output_map = {} - - if num_beams_and_batch is not None: - # Set batch size for input_ids and attention mask to avoid dynamic dimension got propagated from the end of the model back to ReadValue - for input in not_kv_inputs: - shape = input.get_partial_shape() - if shape.rank.get_length() <= 2: # == 1 for beam_index - shape[0] = num_beams_and_batch - input.get_node().set_partial_shape(shape) - for kv_name_pair in zip(key_value_input_names, key_value_output_names): - input_output_map[kv_name_pair[0]] = kv_name_pair[1] - if num_beams_and_batch is not None: - input = ov_model.input(kv_name_pair[0]) - shape = input.get_partial_shape() - shape[batch_dim] = num_beams_and_batch * num_attention_heads - input.get_node().set_partial_shape(shape) - - if num_beams_and_batch is not None: - # Re-validation model if shapes are altered above - ov_model.validate_nodes_and_infer_types() - - apply_make_stateful_transformation(ov_model, input_output_map) - if num_beams_and_batch is None: - build_state_initializer(ov_model, batch_dim) - - -def patch_stateful(ov_model): - key_value_input_names = [key.get_any_name() for key in ov_model.inputs[2:-1]] - key_value_output_names = [key.get_any_name() for key in ov_model.outputs[1:]] - not_kv_inputs = [input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names())] - if not key_value_input_names or not key_value_output_names: - return - batch_dim = 0 - num_attention_heads = 1 - - fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim) - make_stateful( - ov_model, - not_kv_inputs, - key_value_input_names, - key_value_output_names, - batch_dim, - num_attention_heads, - None, - ) - - -def cleanup_torchscript_cache(): - """ - Helper for removing cached model representation - """ - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - -def get_2d_sincos_pos_embed(embed_dim, image_size): - """ - image_size: image_size or (image_height, image_width) - return: - pos_embed: [image_height, image_width, embed_dim] - """ - if isinstance(image_size, int): - grid_h_size, grid_w_size = image_size, image_size - else: - grid_h_size, grid_w_size = image_size[0], image_size[1] - - grid_h = np.arange(grid_h_size, dtype=np.float32) - grid_w = np.arange(grid_w_size, dtype=np.float32) - grid = np.meshgrid(grid_w, grid_h) # here w goes first - grid = np.stack(grid, axis=0) - - pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) - return pos_embed - - -def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): - assert embed_dim % 2 == 0 - - # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[0]) # (H, W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[1]) # (H, W, 
D/2) - - emb = np.concatenate([emb_h, emb_w], axis=-1) # (H, W, D) - return emb - - -def get_1d_sincos_pos_embed_from_grid_new(embed_dim, pos): - """ - embed_dim: output dimension for each position - pos: a list of positions to be encoded: size (H, W) - out: (H, W, D) - """ - assert embed_dim % 2 == 0 - omega = np.arange(embed_dim // 2, dtype=np.float32) - omega /= embed_dim / 2.0 - omega = 1.0 / 10000**omega # (D/2,) - - out = np.einsum("hw,d->hwd", pos, omega) # (H, W, D/2), outer product - - # Align with C++ which always uses double - emb_sin = np.sin(out.astype(np.float64)).astype(np.float32) # (H, W, D/2) - emb_cos = np.cos(out.astype(np.float64)).astype(np.float32) # (H, W, D/2) - - emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (H, W, D) - return emb - - -def patch_model_code(orig_model_dir): - model_file = orig_model_dir / "modeling_navit_siglip.py" - orig_model_file = model_file.parent / ("orig_" + model_file.name) - if not orig_model_file.exists(): - model_file.rename(orig_model_file) - with orig_model_file.open("r") as f: - content = f.read() - content = content.replace("if is_flash_attn_2_available():", "") - content = content.replace("from flash_attn import flash_attn_func, flash_attn_varlen_func", "") - content = content.replace("from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "") - - with model_file.open("w") as out_f: - out_f.write(content) - - -def convert_llm(model, model_dir): - model.llm.config.save_pretrained(model_dir / text_emb_path.parent) - print("⌛ Convert Input embedding model") - ov_model = ov.convert_model(model.llm.model.embed_tokens, example_input=torch.ones([1, 10], dtype=torch.long)) - - ov.save_model(ov_model, model_dir / text_emb_path) - del ov_model - cleanup_torchscript_cache() - gc.collect() - print("✅ Input embedding model successfully converted") - - print("⌛ Convert Language model") - hidden_size = model.llm.config.hidden_size - num_pkv = model.llm.config.num_hidden_layers - pkv_shape = (2, model.llm.config.num_key_value_heads, 2, hidden_size // model.llm.config.num_attention_heads) - - input_embeds = torch.randn((2, 2, hidden_size)) - attention_mask = torch.ones([2, 4], dtype=torch.long) - position_ids = torch.tensor([[2, 3], [2, 3]], dtype=torch.long) - input_names = ["attention_mask", "position_ids"] - output_names = ["logits"] - - past_key_values = [] - for i in range(num_pkv): - kv = [torch.randn(pkv_shape) for _ in range(2)] - past_key_values.append(kv) - input_names.extend([f"past_key_values.{i}.key", f"past_key_values.{i}.value"]) - output_names.extend([f"present.{i}.key", f"present.{i}.value"]) - input_names.append("inputs_embeds") - - example_input = {"inputs_embeds": input_embeds, "attention_mask": attention_mask, "position_ids": position_ids, "past_key_values": past_key_values} - - model.llm.config.torchscript = True - - ov_model = ov.convert_model(model.llm, example_input=example_input) - - for out, out_name in zip(ov_model.outputs, output_names): - out.get_tensor().set_names({out_name}) - - for inp, inp_name in zip(ov_model.inputs, input_names): - inp.get_tensor().set_names({inp_name}) - - patch_stateful(ov_model) - - ov.save_model(ov_model, model_dir / llm_path) - del ov_model - - cleanup_torchscript_cache() - gc.collect() - print("✅ Language model successfully converted") - - -def convert_vision_encoder(model, model_dir): - tgt_sizes = torch.tensor([[23, 45]]) - if not (model_dir / image_emb_path).exists(): - print("⌛ Convert Image embedding model") - def siglip_vis_embed_forward( - self, - 
pixel_values: torch.FloatTensor, - patch_attention_mask: torch.BoolTensor, - tgt_sizes: Optional[torch.IntTensor] = None, - position_ids: Optional[torch.FloatTensor] = None, - ) -> torch.Tensor: - patch_embeds = self.patch_embedding(pixel_values) - embeddings = patch_embeds.flatten(2).transpose(1, 2) - - if position_ids is None: - batch_size = pixel_values.size(0) - max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) - max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size - boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side) - position_ids = torch.full( - size=( - batch_size, - max_nb_patches_h * max_nb_patches_w, - ), - fill_value=0, - ) - - for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - if tgt_sizes is not None: - nb_patches_h = tgt_sizes[batch_idx][0] - nb_patches_w = tgt_sizes[batch_idx][1] - else: - nb_patches_h = p_attn_mask[:, 0].sum() - nb_patches_w = p_attn_mask[0].sum() - - fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) - fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) - - bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) - bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) - - pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten() - position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids - - position_ids = position_ids.to(self.position_embedding.weight.device) - - embeddings = embeddings + self.position_embedding(position_ids) - return embeddings - - def siglip_attn_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, key_states, value_states, attention_mask, is_causal=attention_mask is None - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, None - - def siglip_transformer_forward( - self, - pixel_values, - patch_attention_mask: Optional[torch.BoolTensor] = None, - tgt_sizes: Optional[torch.IntTensor] = None, - position_ids: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - batch_size = pixel_values.size(0) - if patch_attention_mask is None: - 
patch_attention_mask = torch.ones( - size=( - batch_size, - pixel_values.size(2) // self.config.patch_size, - pixel_values.size(3) // self.config.patch_size, - ), - dtype=torch.bool, - device=pixel_values.device, - ) - - hidden_states = self.embeddings( - pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, tgt_sizes=tgt_sizes, position_ids=position_ids - ) - - patch_attention_mask = patch_attention_mask.view(batch_size, -1) - attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) if not self._use_flash_attention_2 else patch_attention_mask - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.post_layernorm(last_hidden_state) - - if not return_dict: - return (last_hidden_state, None) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=None, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - vpm = model.vpm - vpm.embeddings.forward = types.MethodType(siglip_vis_embed_forward, vpm.embeddings) - for layer in vpm.encoder.layers: - layer.self_attn.forward = types.MethodType(siglip_attn_forward, layer.self_attn) - vpm.forward = types.MethodType(siglip_transformer_forward, vpm) - - pixel_values = torch.randn([1, 3, 14, 14490]) - patch_attn_mask = torch.zeros((1, 1, 1035), dtype=torch.bool) - patch_attn_mask[0, 0, : tgt_sizes[0][0] * tgt_sizes[0][1]] = True - position_ids = prepare_vis_position_ids( - pixel_values, patch_attn_mask, tgt_sizes, model.config.vision_config.patch_size, model.config.vision_config.image_size // model.config.patch_size - ) - ov_model = ov.convert_model(vpm, example_input={"pixel_values": pixel_values, "position_ids": position_ids, "patch_attention_mask": patch_attn_mask}) - ov.save_model(ov_model, model_dir / image_emb_path) - del ov_model - cleanup_torchscript_cache() - gc.collect() - print("✅ Image embedding model successfully converted") - - if not (model_dir / resampler_path).exists(): - print("⌛ Convert Resamler model") - - def resampler_forward(self, x, pos_embed, key_padding_mask): - bs = x.shape[0] - x = self.kv_proj(x) # B * L * D - x = self.ln_kv(x).permute(1, 0, 2) # L * B * D - - q = self.ln_q(self.query) # Q * D - - q_bs = q.unsqueeze(1).repeat(1, bs, 1) - - out = self.attn(q_bs, x + pos_embed, x, key_padding_mask=key_padding_mask)[0] # Q * B * D # L * B * D + L * B * D - # out: Q * B * D - x = out.permute(1, 0, 2) # B * Q * D - - x = self.ln_post(x) - x = x @ self.proj - return x - - model.resampler.forward = types.MethodType(resampler_forward, model.resampler) - - pos_embed_base = get_2d_sincos_pos_embed(model.resampler.embed_dim, 70) - - patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] - - max_patch_len = torch.max(patch_len) - key_padding_mask = torch.zeros((1, max_patch_len), dtype=torch.bool) - - pos_embed = [] - tgt_h, tgt_w = tgt_sizes[0] - pos_embed = torch.from_numpy(pos_embed_base[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, 1, -1))) # patches * D - key_padding_mask[0, patch_len:] = True - - ov_model = ov.convert_model(model.resampler, example_input=[torch.randn(1, 1035, 1152), pos_embed, key_padding_mask]) - ov.save_model(ov_model, model_dir / resampler_path) - del ov_model - cleanup_torchscript_cache() - del model.resampler - gc.collect() - print("✅ Resampler model 
successfully converted") - - -def copy_llm_files(model_dir, dst_dir): - shutil.copy(model_dir / text_emb_path, model_dir / dst_dir / text_emb_path.name) - shutil.copy(model_dir / text_emb_path.with_suffix(".bin"), model_dir / dst_dir / text_emb_path.with_suffix(".bin").name) - shutil.copy(model_dir / llm_path.parent / "config.json", model_dir / dst_dir / "config.json") - shutil.copy(model_dir / llm_path.parent / "configuration_minicpm.py", model_dir / dst_dir / "configuration_minicpm.py") - shutil.copy(model_dir / llm_path.parent / "modeling_navit_siglip.py", model_dir / dst_dir / "modeling_navit_siglip.py") - - -def prepare_vis_position_ids(pixel_values, patch_attention_mask, tgt_sizes, patch_size, num_patches_per_side): - batch_size = pixel_values.size(0) - max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) - max_nb_patches_h, max_nb_patches_w = max_im_h // patch_size, max_im_w // patch_size - boundaries = torch.arange(1 / num_patches_per_side, 1.0, 1 / num_patches_per_side) - position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) - - for batch_idx, p_attn_mask in enumerate(patch_attention_mask): - if tgt_sizes is not None: - nb_patches_h = tgt_sizes[batch_idx][0] - nb_patches_w = tgt_sizes[batch_idx][1] - else: - nb_patches_h = p_attn_mask[:, 0].sum() - nb_patches_w = p_attn_mask[0].sum() - - fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) - fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) - - bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) - bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) - - pos_ids = (bucket_coords_h[:, None] * num_patches_per_side + bucket_coords_w).flatten() - position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids - - return position_ids - - -core = ov.Core() - - -class OvModelForCausalLMWithEmb(GenerationMixin): - def __init__(self, model_dir, device="CPU", ov_config=None, compile=True, slice_lm_head=True) -> None: - self._supports_cache_class = False - self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) - self.config.is_decoder = True - self.config.is_encoder_decoder = False - self.generation_config = GenerationConfig.from_model_config(self.config) - model_dir = Path(model_dir) - self.model = core.read_model(model_dir / "language_model.xml") - self.token_emb = core.read_model(model_dir / "embed_tokens.xml") - if slice_lm_head: - self.slice_lm_head() - self.request = None - self.token_emb_request = None - self._device = device.upper() - self.device = torch.device("cpu") - self.ov_config = ov_config - self.next_beam_idx = None - self._past_length = None - self.input_names = [input_t.get_any_name() for input_t in self.model.inputs] - self.main_input_name = "input_ids" - self.llm_times = [] - if compile: - self.compile() - - def slice_lm_head(self): - manager = Manager() - manager.register_pass(InsertSlice()) - manager.run_passes(self.model) - self.model.validate_nodes_and_infer_types() - - def compile(self): - if self.request is None: - self.request = core.compile_model(self.model, self._device, self.ov_config).create_infer_request() - self._compile_token_emb() - - def _compile_token_emb(self): - if self.token_emb_request is None: - self.token_emb_request = core.compile_model(self.token_emb, self._device, self.ov_config) - - def to(self, device: str): - if isinstance(device, str): - self._device = device.upper() - self.clear_requests() - - return self - - def clear_requests(self): - del 
self.request - del self.token_emb_request - self.request = None - self.token_emb_request = None - - def embed_tokens(self, input_ids: torch.LongTensor): - self._compile_token_emb() - res = self.token_emb_request(input_ids, share_inputs=True) - return res[0] - - def prepare_inputs( - self, - input_ids: torch.LongTensor, - attention_mask: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - **kwargs, - ): - batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0] - - inputs = {} - # past_key_values are not used explicitly, instead they are handled inside the model - if past_key_values is None: - self.llm_times = [] - # This is the first iteration in a sequence, reset all states - if self.request is not None: - self.request.reset_state() - # Set initial value for the next beam_idx input that will be used at the current iteration - # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used - self.next_beam_idx = np.arange(batch_size, dtype=int) - self._past_length = 0 - past_len = self._get_past_length(past_key_values) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids if past_key_values is None else input_ids[:, -1:]) - - if hasattr(self.config, "scale_emb"): - inputs_embeds = inputs_embeds * self.config.scale_emb - inputs["inputs_embeds"] = inputs_embeds - - # Add the attention_mask inputs when needed - if "attention_mask" in self.input_names or "position_ids" in self.input_names: - if attention_mask is not None: - attention_mask = np.array(attention_mask) - else: - attention_mask = np.ones((inputs_embeds.shape[0], inputs_embeds.shape[1] + past_len), dtype=int) - - if "attention_mask" in self.input_names: - inputs["attention_mask"] = attention_mask - - if "position_ids" in self.input_names: - if position_ids is not None: - position_ids = np.array(position_ids) - else: - position_ids = np.cumsum(attention_mask, axis=1) - 1 - position_ids[attention_mask == 0] = 1 - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - inputs["position_ids"] = position_ids - - if "beam_idx" in self.input_names: - inputs["beam_idx"] = self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int) - - return inputs - - def forward( - self, - input_ids: torch.LongTensor, - attention_mask: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.LongTensor] = None, - **kwargs, - ): - self.compile() - - inputs = self.prepare_inputs( - input_ids=input_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - **kwargs, - ) - - # Run inference - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - logits = self.request.get_tensor("logits").data - logits = torch.from_numpy(logits).to(self.device) - past_key_values = ((),) - self._past_length += inputs["inputs_embeds"].shape[1] - - return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) - - # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): - # if model is used as a 
decoder in encoder-decoder model, the decoder attention mask is created on the fly - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - - if past_key_values is not None: - past_len = self._get_past_length(past_key_values) - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and input_ids is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_len) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif input_ids is not None and past_len < input_ids.shape[1]: - input_ids = input_ids[:, past_len:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values and input_ids is not None: - position_ids = position_ids[:, -input_ids.shape[1] :] - - model_inputs = { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": position_ids, - "attention_mask": attention_mask, - "inputs_embeds": inputs_embeds if past_key_values is None else None, - } - - return model_inputs - - def _get_past_length(self, past_key_values=None): - if past_key_values is None: - return 0 - return self._past_length - - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache - def _reorder_cache(self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. - This is required to match `past_key_values` with the correct beam_idx at every generation step. 
- """ - self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration - return past_key_values - - def can_generate(self): - """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" - - return True - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - - -class OvMiniCPMV: - def __init__(self, config, vpm, resampler, llm, processor): - self.config = config - self.llm = llm - self.vpm = vpm - self.embed_dim = self.llm.config.hidden_size - self._resampler = resampler - self.processor = processor - self._pos_embeds = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, 70)).float() - self.max_size = (70, 70) - - self.terminators = ["<|im_end|>", "<|endoftext|>"] - - def set_decoder(self, decoder): - self.llm = decoder - - def get_decoder(self): - return self.llm - - def resampler(self, x, tgt_sizes): - bs = x.shape[0] - - patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] - - self._adjust_pos_cache(tgt_sizes) - - max_patch_len = torch.max(patch_len) - key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool) - - pos_embed = [] - for i in range(bs): - tgt_h, tgt_w = tgt_sizes[i] - pos_embed.append(self._pos_embeds[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1))) # patches * D - key_padding_mask[i, patch_len[i] :] = True - - pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(1, 0, 2) # BLD => L * B * D - - res = torch.from_numpy(self._resampler([x, pos_embed, key_padding_mask])[0]) - return res - - def _set_2d_pos_cache(self, max_size): - pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, max_size)).float() - self._pos_embed = pos_embed - - def _adjust_pos_cache(self, tgt_sizes): - max_h = torch.max(tgt_sizes[:, 0]) - max_w = torch.max(tgt_sizes[:, 1]) - if max_h > self.max_size[0] or max_w > self.max_size[1]: - self.max_size = [max(max_h, self.max_size[0]), max(max_w, self.max_size[1])] - self._set_2d_pos_cache(self.max_size) - - def get_vllm_embedding(self, data): - if "vision_hidden_states" not in data: - tgt_sizes = data["tgt_sizes"] - pixel_values_list = data["pixel_values"] - vision_hidden_states = [] - all_pixel_values = [] - img_cnt = [] - for pixel_values in pixel_values_list: - img_cnt.append(len(pixel_values)) - all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_values]) - - # exist image - if all_pixel_values: - tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)] - tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32) - - max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]) - - all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True, padding_value=0.0) - B, L, _ = all_pixel_values.shape - all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L) - - patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool) - for i in range(B): - patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True - - vision_batch_size = 32 - all_pixel_values = all_pixel_values - if B > vision_batch_size: - hs = [] - for i in range(0, B, vision_batch_size): - start_idx = i - end_idx = i + vision_batch_size - block_pxl_values = all_pixel_values[start_idx:end_idx] - block_patch_attn_mask = patch_attn_mask[start_idx:end_idx] - block_tgt_sizes = tgt_sizes[start_idx:end_idx] - block_position_ids = prepare_vis_position_ids( - block_pxl_values, - block_patch_attn_mask, - block_tgt_sizes, - 
self.config.vision_config.patch_size, - self.config.vision_config.image_size // self.config.patch_size, - ) - start = time.perf_counter() - tmp_hs = torch.from_numpy(self.vpm([block_pxl_values, block_patch_attn_mask, block_position_ids])[0]) - self.vpm_times.append(time.perf_counter() - start) - hs.append(tmp_hs) - vision_embedding = torch.cat(hs, dim=0) - else: - position_ids = prepare_vis_position_ids( - all_pixel_values, - patch_attn_mask, - tgt_sizes, - self.config.vision_config.patch_size, - self.config.vision_config.image_size // self.config.patch_size, - ) - start = time.perf_counter() - vision_embedding = torch.from_numpy(self.vpm([all_pixel_values, patch_attn_mask, position_ids])[0]) - vision_embedding = torch.from_numpy(self.vpm([all_pixel_values, patch_attn_mask, position_ids])[0]) - vision_embedding = self.resampler(vision_embedding, tgt_sizes) - - start = 0 - for pixel_values in pixel_values_list: - img_cnt = len(pixel_values) - if img_cnt > 0: - vision_hidden_states.append(vision_embedding[start : start + img_cnt]) - start += img_cnt - else: - vision_hidden_states.append([]) - else: # no image - dummy_feature = [] - for _ in range(len(pixel_values_list)): - vision_hidden_states.append(dummy_feature) - - else: - vision_hidden_states = data["vision_hidden_states"] - - if hasattr(self.llm.config, "scale_emb"): - vllm_embedding = self.llm.embed_tokens(data["input_ids"]) * self.llm.config.scale_emb - else: - vllm_embedding = self.llm.embed_tokens(data["input_ids"]) - - bs = len(data["input_ids"]) - for i in range(bs): - cur_vs_hs = vision_hidden_states[i] - if len(cur_vs_hs) > 0: - cur_vllm_emb = torch.from_numpy(vllm_embedding[i]) - cur_image_bound = data["image_bound"][i] - if len(cur_image_bound) > 0: - image_indices = torch.stack([torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound]) - - cur_vllm_emb.scatter_(0, image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]), cur_vs_hs.view(-1, cur_vs_hs.shape[-1])) - return vllm_embedding - - def forward(self, data, **kwargs): - vllm_embedding = self.get_vllm_embedding(data) - position_ids = data["position_ids"] - if position_ids.dtype != torch.int64: - position_ids = position_ids.long() - - return self.llm(input_ids=None, position_ids=position_ids, inputs_embeds=vllm_embedding, **kwargs) - - def _decode(self, inputs_embeds, tokenizer, attention_mask, decode_text=False, **kwargs): - terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] - output = self.llm.generate( - inputs_embeds=torch.from_numpy(inputs_embeds), pad_token_id=0, eos_token_id=terminators, attention_mask=attention_mask, **kwargs - ) - if decode_text: - return self._decode_text(output, tokenizer) - return output - - def _decode_stream(self, inputs_embeds, tokenizer, **kwargs): - terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] - streamer = TextIteratorStreamer(tokenizer=tokenizer) - generation_kwargs = {"inputs_embeds": torch.from_numpy(inputs_embeds), "pad_token_id": 0, "eos_token_id": terminators, "streamer": streamer} - generation_kwargs.update(kwargs) - - thread = Thread(target=self.llm.generate, kwargs=generation_kwargs) - thread.start() - - return streamer - - def _decode_text(self, result_ids, tokenizer): - terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] - result_text = [] - for result in result_ids: - result = result[result != 0] - if result[0] == tokenizer.bos_id: - result = result[1:] - if result[-1] in terminators: - result = result[:-1] - 
result_text.append(tokenizer.decode(result).strip()) - return result_text - - def generate( - self, - input_ids=None, - pixel_values=None, - tgt_sizes=None, - image_bound=None, - attention_mask=None, - tokenizer=None, - vision_hidden_states=None, - return_vision_hidden_states=False, - stream=False, - decode_text=False, - **kwargs, - ): - assert input_ids is not None - assert len(input_ids) == len(pixel_values) - - model_inputs = { - "input_ids": input_ids, - "image_bound": image_bound, - } - - if vision_hidden_states is None: - model_inputs["pixel_values"] = pixel_values - model_inputs["tgt_sizes"] = tgt_sizes - else: - model_inputs["vision_hidden_states"] = vision_hidden_states - - with torch.inference_mode(): - model_inputs["inputs_embeds"] = self.get_vllm_embedding(model_inputs) - - if stream: - result = self._decode_stream(model_inputs["inputs_embeds"], tokenizer, **kwargs) - else: - result = self._decode(model_inputs["inputs_embeds"], tokenizer, attention_mask, decode_text=decode_text, **kwargs) - - return result - - def chat( - self, - image, - msgs, - tokenizer, - processor=None, - vision_hidden_states=None, - max_new_tokens=2048, - min_new_tokens=0, - sampling=True, - max_inp_length=8192, - system_prompt="", - stream=False, - max_slice_nums=None, - use_image_id=None, - **kwargs, - ): - self.vpm_times = [] - self.resampler_times = [] - if isinstance(msgs[0], list): - batched = True - else: - batched = False - msgs_list = msgs - images_list = image - - if batched is False: - images_list, msgs_list = [images_list], [msgs_list] - else: - assert images_list is None, "Please integrate image to msgs when using batch inference." - images_list = [None] * len(msgs_list) - assert len(images_list) == len(msgs_list), "The batch dim of images_list and msgs_list should be the same." - - if processor is None: - if self.processor is None: - self.processor = AutoProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True) - processor = self.processor - - assert ( - self.config.query_num == processor.image_processor.image_feature_size - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert ( - self.config.patch_size == processor.image_processor.patch_size - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert ( - self.config.use_image_id == processor.image_processor.use_image_id - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert ( - self.config.slice_config.max_slice_nums == processor.image_processor.max_slice_nums - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert ( - self.config.slice_mode == processor.image_processor.slice_mode - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." 
- - prompts_lists = [] - input_images_lists = [] - for image, msgs in zip(images_list, msgs_list): - if isinstance(msgs, str): - msgs = json.loads(msgs) - copy_msgs = deepcopy(msgs) - - assert len(msgs) > 0, "msgs is empty" - - if image is not None and isinstance(copy_msgs[0]["content"], str): - copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]] - - images = [] - for i, msg in enumerate(copy_msgs): - role = msg["role"] - content = msg["content"] - assert role in ["user", "assistant"] - if i == 0: - assert role == "user", "The role of first msg should be user" - if isinstance(content, str): - content = [content] - cur_msgs = [] - for c in content: - if isinstance(c, Image.Image): - images.append(c) - cur_msgs.append("(./)") - elif isinstance(c, str): - cur_msgs.append(c) - msg["content"] = "\n".join(cur_msgs) - - if system_prompt: - sys_msg = {"role": "system", "content": system_prompt} - copy_msgs = [sys_msg] + copy_msgs - - prompts_lists.append(processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True)) - input_images_lists.append(images) - - inputs = processor( - prompts_lists, input_images_lists, max_slice_nums=max_slice_nums, use_image_id=use_image_id, return_tensors="pt", max_length=max_inp_length - ) - - if sampling: - generation_config = {"top_p": 0.8, "top_k": 100, "temperature": 0.7, "do_sample": True, "repetition_penalty": 1.05} - else: - generation_config = { - "repetition_penalty": 1.0, - } - - if min_new_tokens > 0: - generation_config["min_new_tokens"] = min_new_tokens - - generation_config.update((k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()) - - inputs.pop("image_sizes") - with torch.inference_mode(): - res = self.generate( - **inputs, - tokenizer=tokenizer, - max_new_tokens=max_new_tokens, - vision_hidden_states=vision_hidden_states, - stream=stream, - decode_text=True, - **generation_config, - ) - - if stream: - - def stream_gen(): - for text in res: - for term in self.terminators: - text = text.replace(term, "") - yield text - - return stream_gen() - - else: - if batched: - answer = res - else: - answer = res[0] - return answer - - -def init_model(model_dir, device): - config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) - llm = OvModelForCausalLMWithEmb(model_dir, device) - img_emb = core.compile_model(model_dir / image_emb_path, device) - resampler = core.compile_model(model_dir / resampler_path, device) - processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True) - - ov_model = OvMiniCPMV(config, img_emb, resampler, llm, processor) - return ov_model - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("model_dir", type=Path) - model_dir = parser.parse_args().model_dir - model_id = "openbmb/MiniCPM-V-2_6" - ckpt = model_dir / "ckpt" - if not ckpt.exists(): - snapshot_download(model_id, local_dir=ckpt, force_download=True) - patch_model_code(ckpt) - model = AutoModel.from_pretrained(ckpt, trust_remote_code=True) - model.eval() - model.config.save_pretrained(model_dir) - tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True) - tokenizer.save_pretrained(model_dir) - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, with_detokenizer=True) - ov.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml") - ov.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml") - processor = AutoProcessor.from_pretrained(ckpt, trust_remote_code=True) - processor.save_pretrained(model_dir) - - 
convert_llm(model, model_dir)
-    del model.llm
-    gc.collect()
-
-    convert_vision_encoder(model, model_dir)
-    # ov_cpm = init_model(model_dir, "CPU")
-    # print(ov_cpm.chat(Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw), [{"role": "user", "content": "What is unusual on this image?"}], ov_cpm.processor.tokenizer, sampling=False))
-
-if "__main__" == __name__:
-    main()
diff --git a/samples/python/visual_language_chat/README.md b/samples/python/visual_language_chat/README.md
index 16ef0959c5..06355d9ee5 100644
--- a/samples/python/visual_language_chat/README.md
+++ b/samples/python/visual_language_chat/README.md
@@ -10,8 +10,8 @@ It's not required to install [../../requirements.txt](../../requirements.txt) fo
 ```sh
 pip install --upgrade-strategy eager -r ../../requirements.txt
+optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code MiniCPM-V-2_6
 ```
-# TODO: add optimum cli command for miniCPM-V-2_6 when available
 
 ## Run:
 
 [This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image.
diff --git a/samples/requirements.txt b/samples/requirements.txt
index e901229c75..69d5572b28 100644
--- a/samples/requirements.txt
+++ b/samples/requirements.txt
@@ -5,4 +5,4 @@ einops==0.8.0 # For Qwen
 transformers_stream_generator==0.0.5 # For Qwen
 diffusers==0.30.3
 librosa # For Whisper
-torchvision # needed for mini-CPM export script. Need to remove when we switch to exporting with optimum-intel.
+torchvision # For visual language models
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 56a4bbbeab..0d5772202d 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -350,28 +350,18 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         m_image_id{0} {
         if (m_vlm_config.model_type == VLMModelType::MINICPM) {
             m_resampler = ov::Core{}.compile_model(
-                model_dir / "resampler.xml", device, device_config
-            ).create_infer_request();
-
-            m_embedding = ov::Core{}.compile_model(
-                model_dir / "embed_tokens.xml", device, device_config
-            ).create_infer_request();
-
-            m_language = ov::Core{}.compile_model(
-                model_dir / "language_model.xml", device, device_config
+                model_dir / "openvino_resampler_model.xml", device, device_config
             ).create_infer_request();
 
             m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
-        } else if (m_vlm_config.model_type == VLMModelType::LLAVA) {
-            m_language = ov::Core{}.compile_model(
-                model_dir / "openvino_language_model.xml", device, device_config
-            ).create_infer_request();
-
-            // Reusing the same m_embedding for llava text_embeddings model
-            m_embedding = ov::Core{}.compile_model(
-                model_dir / "openvino_text_embeddings_model.xml", device, device_config
-            ).create_infer_request();
         }
+        m_embedding = ov::Core{}.compile_model(
+            model_dir / "openvino_text_embeddings_model.xml", device, device_config
+        ).create_infer_request();
+
+        m_language = ov::Core{}.compile_model(
+            model_dir / "openvino_language_model.xml", device, device_config
+        ).create_infer_request();
 
         m_language.get_tensor("attention_mask").set_shape({1, 0});
     }
@@ -707,8 +697,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
             pipe.m_pos_embed_cache
         );
         size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end());
-        ov::Tensor key_padding_mask(ov::element::boolean, {bs, max_patch_len});
-        bool* mask_data = key_padding_mask.data<bool>();
+        ov::Tensor key_padding_mask(ov::element::f32, {bs, max_patch_len});
+        float* mask_data = key_padding_mask.data<float>();
         size_t embed_len = pipe.m_pos_embed_cache.get_shape().at(2);
         ov::Tensor pos_embed(ov::element::f32, {max_patch_len, bs, embed_len});  // BLD => L * B * D
         float* pos_embed_data = pos_embed.data<float>();
@@ -730,10 +720,10 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
             for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) {
                 std::fill_n(pos_embed_data + flat * bs * embed_len + i * embed_len, embed_len, 0.0f);
             }
-            std::fill_n(mask_data + i * max_patch_len, patch_len[i], false);
-            std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], true);
+            std::fill_n(mask_data + i * max_patch_len, patch_len[i], 0.0f);
+            std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], 1.0f);
         }
-        pipe.m_resampler.set_tensor("x", encoded_image);  // [N, H*W, old_hidden_size]
+        pipe.m_resampler.set_tensor("image_feature", encoded_image);  // [N, H*W, old_hidden_size]
         pipe.m_resampler.set_tensor("pos_embed", pos_embed);  // [H*W, N, new_hidden_size]
         pipe.m_resampler.set_tensor("key_padding_mask", key_padding_mask);  // [N, H*W]
         pipe.m_resampler.infer();
diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp
index df7f43af77..ee7e353e45 100644
--- a/src/cpp/src/visual_language/vision_encoder.cpp
+++ b/src/cpp/src/visual_language/vision_encoder.cpp
@@ -300,8 +300,8 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
         ov::Tensor input_tensor{ov::element::f32, {1, 3, size_t(resized_preprocessed.ny), size_t(resized_preprocessed.nx)}, (void*)(resized_preprocessed.buf.data())};
         ov::Tensor pixel_values = preprocess_for_encoder(input_tensor, patch_size);
         encoder.set_tensor("pixel_values", pixel_values);
-        ov::Tensor patch_attention_mask{ov::element::boolean, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}};
-        std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
+        ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}};
+        std::fill_n(patch_attention_mask.data<float>(), patch_attention_mask.get_size(), 1.0f);
         encoder.set_tensor("patch_attention_mask", patch_attention_mask);
         ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {resized_source_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
         encoder.set_tensor("position_ids", position_ids);
@@ -333,8 +333,8 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
             patch_size
         );
         encoder.set_tensor("pixel_values", pixel_values);
-        ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, slices_size.height * slices_size.width}};
-        std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
+        ov::Tensor patch_attention_mask{ov::element::f32, {1, 1, slices_size.height * slices_size.width}};
+        std::fill_n(patch_attention_mask.data<float>(), patch_attention_mask.get_size(), 1.0f);
         encoder.set_tensor("patch_attention_mask", patch_attention_mask);
         ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {slices_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
         encoder.set_tensor("position_ids", position_ids);
@@ -431,12 +431,7 @@ ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig
 VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) :
     model_type(model_type) {
-    if (model_type == VLMModelType::MINICPM) {
-        m_vision_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request();
-    } else if (model_type == VLMModelType::LLAVA) {
-        // Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel
-        m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
-    }
+    m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
     m_processor_config = ov::genai::utils::from_config_json_if_exists<ov::genai::ProcessorConfig>(
         model_dir, "preprocessor_config.json"
     );
diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index 1e7a1b81a5..b13369b7ba 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -28,8 +28,8 @@
 @pytest.mark.precommit
 @pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit")))
 @pytest.mark.xfail(
-    raises=RuntimeError,
-    reason="Test fails with error: CPU: head size must be multiple of 16, current: X. CVS-145986.",
+    raises=(RuntimeError, AttributeError),
+    reason="RuntimeError with error: CPU: head size must be multiple of 16, current: X. CVS-145986. AttributeError: 'CodeGenAttention' object has no attribute 'causal_mask' for hf-tiny-model-private/tiny-random-CodeGenForCausalLM",
     strict=True,
 )
 def test_sampling_precommit(tmp_path, model_id):
diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_vlm_api.py
index 25b61dd0f9..bb5d421716 100644
--- a/tests/python_tests/test_vlm_api.py
+++ b/tests/python_tests/test_vlm_api.py
@@ -1,69 +1,33 @@
 # Copyright (C) 2018-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-import openvino_genai
+import openvino_tokenizers
+import openvino
 import pytest
-import gc
-import os
-import numpy as np
-from PIL import Image
-from multiprocessing import Process
-
+import transformers
+from optimum.intel.openvino import OVModelForVisualCausalLM
 from openvino_genai import VLMPipeline
-from openvino import Tensor
 from common import get_greedy, get_image_by_link, get_beam_search, get_greedy, get_multinomial_all_parameters
 
 def get_ov_model(model_dir):
-    import sys
-    from pathlib import Path
-    #TODO: use optimum-intel
-
-    sys.path.append(str(Path(__file__).resolve().parents[2] / 'samples/cpp/visual_language_chat'))
-    import importlib
-    export_MiniCPM = importlib.import_module("export_MiniCPM-V-2_6", "export_MiniCPM")
-    convert_llm = getattr(export_MiniCPM, "convert_llm")
-    convert_vision_encoder = getattr(export_MiniCPM, "convert_vision_encoder")
-    from transformers import AutoModel, AutoTokenizer, AutoProcessor
-    import os
-    import openvino_tokenizers
-    import openvino as ov
-    import gc
-
+    if (model_dir / "openvino_language_model.xml").exists():
+        return model_dir
     model_id = "openbmb/MiniCPM-V-2_6"
-    ckpt = Path(os.path.join(model_dir, "ckpt"))
-    if not ckpt.exists():
-        snapshot_download = getattr(export_MiniCPM, "snapshot_download")
-        patch_model_code = getattr(export_MiniCPM, "patch_model_code")
-        snapshot_download(model_id, local_dir=ckpt, force_download=True)
-        patch_model_code(ckpt)
-    model = AutoModel.from_pretrained(ckpt, trust_remote_code=True)
-    model.eval()
+    processor = transformers.AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+    processor.tokenizer.save_pretrained(model_dir)
+    ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True)
+    openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml")
+    openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml")
+    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, trust_remote_code=True)
     model.config.save_pretrained(model_dir)
-    tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
-    tokenizer.save_pretrained(model_dir)
-    ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, with_detokenizer=True)
-    ov.save_model(ov_tokenizer, os.path.join(model_dir, "openvino_tokenizer.xml"))
-    ov.save_model(ov_detokenizer, os.path.join(model_dir, "openvino_detokenizer.xml"))
-    processor = AutoProcessor.from_pretrained(ckpt, trust_remote_code=True)
-    processor.save_pretrained(model_dir)
-
-    convert_llm(model, model_dir)
-    del model.llm
-    gc.collect()
-
-    convert_vision_encoder(model, model_dir)
+    model.generation_config.save_pretrained(model_dir)
+    model.save_pretrained(model_dir)
     return model_dir
 
-sampling_configs = [
-    get_beam_search(),
-    get_greedy(),
-    get_multinomial_all_parameters()
-]
 prompts = [
     "What is on the image?",
     "What is special about this image?",
-    "Tell me more about this image."
 ]
 
 image_links = [
@@ -75,39 +39,51 @@ def get_ov_model(model_dir):
 image_links_for_testing = [
     [],
     [image_links[0]],
-    [image_links[1], image_links[0]],
     [image_links[0], image_links[2], image_links[1]]
 ]
 
 @pytest.mark.precommit
-def test_vlm_pipeline(tmp_path):
-    import os
-
+@pytest.mark.nightly
+def test_vlm_pipeline(cache):
     def streamer(word: str) -> bool:
-        print(word, end="")
         return False
 
-    model_path = get_ov_model(os.path.join(tmp_path, "miniCPM"))
+    model_path = get_ov_model(cache.mkdir("MiniCPM-V-2_6"))
 
-    for generation_config in sampling_configs:
-        for links in image_links_for_testing:
-            images = []
-            for link in links:
-                images.append(get_image_by_link(link))
+    for links in image_links_for_testing:
+        images = []
+        for link in links:
+            images.append(get_image_by_link(link))
 
-            pipe = VLMPipeline(model_path, "CPU")
-            pipe.start_chat()
+        pipe = VLMPipeline(str(model_path), "CPU")
+        pipe.start_chat()
 
-            pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer)
+        pipe.generate(prompts[0], images=images, generation_config=get_greedy(), streamer=streamer)
 
-            for prompt in prompts[1:]:
-                pipe.generate(prompt, generation_config=generation_config, streamer=streamer)
+        for prompt in prompts[1:]:
+            pipe.generate(prompt, generation_config=get_greedy(), streamer=streamer)
 
-            pipe.finish_chat()
-    gc.collect()
+        pipe.finish_chat()
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_vlm_get_tokenizer(cache):
+    model_path = get_ov_model(cache.mkdir("MiniCPM-V-2_6"))
+    pipe = VLMPipeline(str(model_path), "CPU")
     tokenizer = pipe.get_tokenizer()
     tokenizer.encode("")
-    del pipe
-    gc.collect()
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize("config", [
+    get_beam_search(),
+    get_multinomial_all_parameters(),
+])
+@pytest.mark.skip("Enable after sampler is enabled")
+def test_sampling(config, cache):
+    model_path = get_ov_model(cache.mkdir("MiniCPM-V-2_6"))
+    image = get_image_by_link(image_links[0])
+    pipe = VLMPipeline(str(model_path), "CPU")
+    pipe.generate(prompts[0], image=image, generation_config=config)
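
For reference, a minimal sketch of the Python API exercised by the updated tests. It assumes the model has already been exported with `optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code MiniCPM-V-2_6` (as in the READMEs above) and that a local `cat.jpg` exists; the `load_image` helper is illustrative only, mirroring the PIL-based loading the samples use, and is not part of the repo.

```python
import numpy as np
import openvino
import openvino_genai
from PIL import Image


def load_image(path: str) -> openvino.Tensor:
    # Illustrative helper (assumption): wrap an RGB image as a uint8 OpenVINO tensor.
    pic = Image.open(path).convert("RGB")
    return openvino.Tensor(np.array(pic, dtype=np.uint8))


pipe = openvino_genai.VLMPipeline("./MiniCPM-V-2_6/", "CPU")

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100


def streamer(subword: str) -> bool:
    print(subword, end="", flush=True)
    return False  # returning False lets generation continue


pipe.start_chat()
# The image accompanies the first prompt; follow-up turns reuse the chat history.
pipe.generate("What is on the image?", images=[load_image("cat.jpg")],
              generation_config=config, streamer=streamer)
pipe.generate("What is special on the image?",
              generation_config=config, streamer=streamer)
pipe.finish_chat()
```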