From 3afe73a459b7f35fbfce2ef70a339ee940129375 Mon Sep 17 00:00:00 2001
From: Anastasiia Pnevskaia
Date: Fri, 11 Oct 2024 12:30:42 +0200
Subject: [PATCH] Used optimum-cli for miniCPM conversion.

---
 .../visual_language_chat.py          |    4 +-
 src/python/py_vlm_pipeline.cpp       |   29 +-
 tests/python_tests/export_MiniCPM.py |  964 ------------------
 tests/python_tests/requirements.txt  |    1 +
 tests/python_tests/test_vlm_api.py   |   44 +-
 5 files changed, 15 insertions(+), 1027 deletions(-)
 delete mode 100644 tests/python_tests/export_MiniCPM.py

diff --git a/samples/python/visual_language_chat/visual_language_chat.py b/samples/python/visual_language_chat/visual_language_chat.py
index 2eb4e3a8b2..8d908e0bf7 100644
--- a/samples/python/visual_language_chat/visual_language_chat.py
+++ b/samples/python/visual_language_chat/visual_language_chat.py
@@ -55,7 +55,7 @@ def main():
     pipe.start_chat()
     prompt = input('question:\n')
-    pipe(prompt, image=image, generation_config=config, streamer=streamer)
+    pipe.generate(prompt, image=image, generation_config=config, streamer=streamer)
     print('\n----------')
     while True:
@@ -63,7 +63,7 @@
             prompt = input('question:\n')
         except EOFError:
             break
-        pipe(prompt, generation_config=config, streamer=streamer)
+        pipe.generate(prompt, generation_config=config, streamer=streamer)
         print('\n----------')
     pipe.finish_chat()
diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp
index 04faed542a..1895d53003 100644
--- a/src/python/py_vlm_pipeline.cpp
+++ b/src/python/py_vlm_pipeline.cpp
@@ -121,6 +121,7 @@ void init_vlm_pipeline(py::module_& m) {
         .def("start_chat", &ov::genai::VLMPipeline::start_chat, py::arg("system_message") = "")
         .def("finish_chat", &ov::genai::VLMPipeline::finish_chat)
         .def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config)
+        .def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config)
         .def(
             "generate",
             [](ov::genai::VLMPipeline& pipe,
@@ -148,33 +149,5 @@ void init_vlm_pipeline(py::module_& m) {
             },
             py::arg("prompt"), "Input string",
             (vlm_generate_kwargs_docstring + std::string(" \n ")).c_str()
-        )
-        .def(
-            "__call__",
-            [](ov::genai::VLMPipeline& pipe,
-               const std::string& prompt,
-               const std::vector<ov::Tensor>& images,
-               const ov::genai::GenerationConfig& generation_config,
-               const utils::PyBindStreamerVariant& streamer,
-               const py::kwargs& kwargs
-            ) {
-                return call_vlm_generate(pipe, prompt, images, generation_config, streamer, kwargs);
-            },
-            py::arg("prompt"), "Input string",
-            py::arg("images"), "Input images",
-            py::arg("generation_config") = std::nullopt, "generation_config",
-            py::arg("streamer") = std::monostate(), "streamer",
-            (vlm_generate_docstring + std::string(" \n ")).c_str()
-        )
-        .def(
-            "__call__",
-            [](ov::genai::VLMPipeline& pipe,
-               const std::string& prompt,
-               const py::kwargs& kwargs
-            ) {
-                return call_vlm_generate(pipe, prompt, kwargs);
-            },
-            py::arg("prompt"), "Input string",
-            (vlm_generate_kwargs_docstring + std::string(" \n ")).c_str()
-        );
+        );
 }
diff --git a/tests/python_tests/export_MiniCPM.py b/tests/python_tests/export_MiniCPM.py
deleted file mode 100644
index 9ad342fe0d..0000000000
--- a/tests/python_tests/export_MiniCPM.py
+++ /dev/null
@@ -1,964 +0,0 @@
-import argparse
-import requests
-import torch
-from threading import Thread
-from copy import deepcopy
-import shutil
-import json
-from PIL import Image
-from transformers import AutoModel, AutoTokenizer, AutoProcessor, TextIteratorStreamer
-from transformers.generation import GenerationMixin
-from transformers import AutoConfig,
GenerationConfig -from transformers.modeling_outputs import CausalLMOutputWithPast -from pathlib import Path -from huggingface_hub import snapshot_download -import types -from typing import Optional, Tuple, List -from openvino.runtime import opset13 -import openvino as ov -import openvino_tokenizers -import numpy as np -import gc - -text_emb_path = Path("embed_tokens.xml") -image_emb_path = Path("image_encoder.xml") -resampler_path = Path("resampler.xml") -llm_path = Path("language_model.xml") - - -def model_has_state(ov_model: ov.Model): - return len(ov_model.get_sinks()) > 0 - - -def model_has_input_output_name(ov_model: ov.Model, name: str): - """ - Helper function for checking that model has specified input or output name - - Parameters: - ov_model (ov.Model): - name (str): - name of input or output - - Returns: - True if input or output with requested name exists else False - """ - return name in sum([list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], []) - - -def fuse_cache_reorder( - ov_model: ov.Model, - not_kv_inputs: List[str], - key_value_input_names: List[str], - gather_dim: int, -): - """ - Fuses reored_cache during generate cycle into ov.Model. Used with stateful models, because we can not modify model state directly. - - Adds a new beam_idx parameter and Gather op per each kv-cache input in a given model. - Should be run before make_stateful. Implements optimumum's _reorder_cache - inside the model in the beginning of each iteration. - Gather works along given gather_dim dimension that may vary from model to model. - KV-cache inputs are identified based on names in key_value_input_names. - Append the new beam_idx parameter to not_kv_inputs. - - Parameters: - ov_model (`ov.Model`): - openvino model for processing - not_kv_inputs (`List[str]`): - list of input nodes in model that not related to past key values - key_value_input_names (`List[str]`): - list of names for key value input layers - gather_dim (int): - dimension for gathering cache during reorder pass - """ - - if model_has_input_output_name(ov_model, "beam_idx"): - raise ValueError("Model already has fused cache") - input_batch = ov_model.input("inputs_embeds").get_partial_shape()[0] - beam_idx = opset13.parameter(name="beam_idx", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch])) - beam_idx.output(0).get_tensor().add_names({"beam_idx"}) - ov_model.add_parameters([beam_idx]) - not_kv_inputs.append(ov_model.inputs[-1]) - # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx - for input_name in key_value_input_names: - parameter_output_port = ov_model.input(input_name) - consumers = parameter_output_port.get_target_inputs() - gather = opset13.gather(parameter_output_port, beam_idx, opset13.constant(gather_dim)) - for consumer in consumers: - consumer.replace_source_output(gather.output(0)) - ov_model.validate_nodes_and_infer_types() - - -def build_state_initializer(ov_model: ov.Model, batch_dim: int): - """ - Build initialization ShapeOf Expression for all ReadValue ops - - Parameters: - ov_model (ov.Model): - openvino model - batch_dim (int): - index of dimension corresponding to batch size - """ - input_ids = ov_model.input("inputs_embeds") - batch = opset13.gather( - opset13.shape_of(input_ids, output_type="i64"), - opset13.constant([0]), - opset13.constant(0), - ) - for op in ov_model.get_ops(): - if op.get_type_name() == "ReadValue": - dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))] - dims[batch_dim] = batch - dims = 
[(opset13.constant(np.array([dim], dtype=np.int64)) if isinstance(dim, int) else dim) for dim in dims] - shape = opset13.concat(dims, axis=0) - broadcast = opset13.broadcast(opset13.constant(0.0, dtype=op.get_output_element_type(0)), shape) - op.set_arguments([broadcast]) - ov_model.validate_nodes_and_infer_types() - - -def make_stateful( - ov_model: ov.Model, - not_kv_inputs: List[str], - key_value_input_names: List[str], - key_value_output_names: List[str], - batch_dim: int, - num_attention_heads: int, - num_beams_and_batch: int = None, -): - """ - Hides kv-cache inputs and outputs inside the model as variables. - - Parameters: - ov_model (ov.Model): - openvino model - not_kv_inputs (`List[str]`): - list of input nodes in model that not related to past key values - key_value_input_names (`List[str]`): - list of names for key value input layers - key_value_output_names (`List[str]`): - list of names for key value input layers - batch_dim (int): - index of batch dimension in key value layers - num_attention_heads (int): - number of attention heads for batch dimension initialization - num_beams_an_batch (int): - precalculated number of beams and batch for shapes initialization - """ - from openvino._offline_transformations import apply_make_stateful_transformation - - input_output_map = {} - - if num_beams_and_batch is not None: - # Set batch size for input_ids and attention mask to avoid dynamic dimension got propagated from the end of the model back to ReadValue - for input in not_kv_inputs: - shape = input.get_partial_shape() - if shape.rank.get_length() <= 2: # == 1 for beam_index - shape[0] = num_beams_and_batch - input.get_node().set_partial_shape(shape) - for kv_name_pair in zip(key_value_input_names, key_value_output_names): - input_output_map[kv_name_pair[0]] = kv_name_pair[1] - if num_beams_and_batch is not None: - input = ov_model.input(kv_name_pair[0]) - shape = input.get_partial_shape() - shape[batch_dim] = num_beams_and_batch * num_attention_heads - input.get_node().set_partial_shape(shape) - - if num_beams_and_batch is not None: - # Re-validation model if shapes are altered above - ov_model.validate_nodes_and_infer_types() - - apply_make_stateful_transformation(ov_model, input_output_map) - if num_beams_and_batch is None: - build_state_initializer(ov_model, batch_dim) - - -def patch_stateful(ov_model): - key_value_input_names = [key.get_any_name() for key in ov_model.inputs[2:-1]] - key_value_output_names = [key.get_any_name() for key in ov_model.outputs[1:]] - not_kv_inputs = [input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names())] - if not key_value_input_names or not key_value_output_names: - return - batch_dim = 0 - num_attention_heads = 1 - - fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim) - make_stateful( - ov_model, - not_kv_inputs, - key_value_input_names, - key_value_output_names, - batch_dim, - num_attention_heads, - None, - ) - - -def cleanup_torchscript_cache(): - """ - Helper for removing cached model representation - """ - torch._C._jit_clear_class_registry() - torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() - torch.jit._state._clear_class_state() - - -def get_2d_sincos_pos_embed(embed_dim, image_size): - """ - image_size: image_size or (image_height, image_width) - return: - pos_embed: [image_height, image_width, embed_dim] - """ - if isinstance(image_size, int): - grid_h_size, grid_w_size = image_size, image_size - else: - grid_h_size, 
grid_w_size = image_size[0], image_size[1] - - grid_h = np.arange(grid_h_size, dtype=np.float32) - grid_w = np.arange(grid_w_size, dtype=np.float32) - grid = np.meshgrid(grid_w, grid_h) # here w goes first - grid = np.stack(grid, axis=0) - - pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) - return pos_embed - - -def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): - assert embed_dim % 2 == 0 - - # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[0]) # (H, W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[1]) # (H, W, D/2) - - emb = np.concatenate([emb_h, emb_w], axis=-1) # (H, W, D) - return emb - - -def get_1d_sincos_pos_embed_from_grid_new(embed_dim, pos): - """ - embed_dim: output dimension for each position - pos: a list of positions to be encoded: size (H, W) - out: (H, W, D) - """ - assert embed_dim % 2 == 0 - omega = np.arange(embed_dim // 2, dtype=np.float32) - omega /= embed_dim / 2.0 - omega = 1.0 / 10000**omega # (D/2,) - - out = np.einsum("hw,d->hwd", pos, omega) # (H, W, D/2), outer product - - emb_sin = np.sin(out) # (H, W, D/2) - emb_cos = np.cos(out) # (H, W, D/2) - - emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (H, W, D) - return emb - - -def patch_model_code(orig_model_dir): - model_file = orig_model_dir / "modeling_navit_siglip.py" - orig_model_file = model_file.parent / ("orig_" + model_file.name) - if not orig_model_file.exists(): - model_file.rename(orig_model_file) - with orig_model_file.open("r") as f: - content = f.read() - content = content.replace("if is_flash_attn_2_available():", "") - content = content.replace("from flash_attn import flash_attn_func, flash_attn_varlen_func", "") - content = content.replace("from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "") - - with model_file.open("w") as out_f: - out_f.write(content) - - -def convert_llm(model, model_dir): - model.llm.config.save_pretrained(model_dir / text_emb_path.parent) - print("⌛ Convert Input embedding model") - ov_model = ov.convert_model(model.llm.model.embed_tokens, example_input=torch.ones([1, 10], dtype=torch.long)) - - ov.save_model(ov_model, model_dir / text_emb_path) - del ov_model - cleanup_torchscript_cache() - gc.collect() - print("✅ Input embedding model successfully converted") - - print("⌛ Convert Language model") - hidden_size = model.llm.config.hidden_size - num_pkv = model.llm.config.num_hidden_layers - pkv_shape = (2, model.llm.config.num_key_value_heads, 2, hidden_size // model.llm.config.num_attention_heads) - - input_embeds = torch.randn((2, 2, hidden_size)) - attention_mask = torch.ones([2, 4], dtype=torch.long) - position_ids = torch.tensor([[2, 3], [2, 3]], dtype=torch.long) - input_names = ["attention_mask", "position_ids"] - output_names = ["logits"] - - past_key_values = [] - for i in range(num_pkv): - kv = [torch.randn(pkv_shape) for _ in range(2)] - past_key_values.append(kv) - input_names.extend([f"past_key_values.{i}.key", f"past_key_values.{i}.value"]) - output_names.extend([f"present.{i}.key", f"present.{i}.value"]) - input_names.append("inputs_embeds") - - example_input = {"inputs_embeds": input_embeds, "attention_mask": attention_mask, "position_ids": position_ids, "past_key_values": past_key_values} - - model.llm.config.torchscript = True - - ov_model = ov.convert_model(model.llm, example_input=example_input) - - for out, out_name in zip(ov_model.outputs, output_names): - out.get_tensor().set_names({out_name}) - - for inp, 
inp_name in zip(ov_model.inputs, input_names): - inp.get_tensor().set_names({inp_name}) - - patch_stateful(ov_model) - - ov.save_model(ov_model, model_dir / llm_path) - del ov_model - - cleanup_torchscript_cache() - gc.collect() - print("✅ Language model successfully converted") - - -def convert_vision_encoder(model, model_dir): - tgt_sizes = torch.tensor([[23, 45]]) - if not (model_dir / image_emb_path).exists(): - print("⌛ Convert Image embedding model") - pixel_values = torch.randn([1, 3, 14, 14490]) - patch_attn_mask = torch.zeros((1, 1, 1035), dtype=torch.bool) - patch_attn_mask[0, 0, : tgt_sizes[0][0] * tgt_sizes[0][1]] = True - ov_model = ov.convert_model(model.vpm, example_input={"pixel_values": pixel_values, "tgt_sizes": tgt_sizes, "patch_attention_mask": patch_attn_mask}) - ov.save_model(ov_model, model_dir / image_emb_path) - del ov_model - cleanup_torchscript_cache() - print("✅ Image embedding model successfully converted") - - if not (model_dir / resampler_path).exists(): - print("⌛ Convert Resamler model") - - def resampler_forward(self, x, pos_embed, key_padding_mask): - bs = x.shape[0] - x = self.kv_proj(x) # B * L * D - x = self.ln_kv(x).permute(1, 0, 2) # L * B * D - - q = self.ln_q(self.query) # Q * D - - out = self.attn(self._repeat(q, bs), x + pos_embed, x, key_padding_mask=key_padding_mask)[0] # Q * B * D # L * B * D + L * B * D - # out: Q * B * D - x = out.permute(1, 0, 2) # B * Q * D - - x = self.ln_post(x) - x = x @ self.proj - return x - - model.resampler.forward = types.MethodType(resampler_forward, model.resampler) - - pos_embed_base = get_2d_sincos_pos_embed(model.resampler.embed_dim, 70) - - patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] - - max_patch_len = torch.max(patch_len) - key_padding_mask = torch.zeros((1, max_patch_len), dtype=torch.bool) - - pos_embed = [] - tgt_h, tgt_w = tgt_sizes[0] - pos_embed = torch.from_numpy(pos_embed_base[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, 1, -1))) # patches * D - key_padding_mask[0, patch_len:] = True - - ov_model = ov.convert_model(model.resampler, example_input=[torch.randn(1, 1035, 1152), pos_embed, key_padding_mask]) - ov.save_model(ov_model, model_dir / resampler_path) - del ov_model - cleanup_torchscript_cache() - print("✅ Resampler model successfully converted") - - -def copy_llm_files(model_dir, dst_dir): - shutil.copy(model_dir / text_emb_path, model_dir / dst_dir / text_emb_path.name) - shutil.copy(model_dir / text_emb_path.with_suffix(".bin"), model_dir / dst_dir / text_emb_path.with_suffix(".bin").name) - shutil.copy(model_dir / llm_path.parent / "config.json", model_dir / dst_dir / "config.json") - shutil.copy(model_dir / llm_path.parent / "configuration_minicpm.py", model_dir / dst_dir / "configuration_minicpm.py") - shutil.copy(model_dir / llm_path.parent / "modeling_navit_siglip.py", model_dir / dst_dir / "modeling_navit_siglip.py") - - -core = ov.Core() - - -class OvModelForCausalLMWithEmb(GenerationMixin): - def __init__(self, model_dir, device="CPU", ov_config=None, compile=True) -> None: - self._supports_cache_class = False - self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) - self.config.is_decoder = True - self.config.is_encoder_decoder = False - self.generation_config = GenerationConfig.from_model_config(self.config) - model_dir = Path(model_dir) - self.model = core.read_model(model_dir / "language_model.xml") - self.token_emb = core.read_model(model_dir / "embed_tokens.xml") - self.request = None - self.token_emb_request = None - self._device = device.upper() - 
self.device = torch.device("cpu") - self.ov_config = ov_config - self.next_beam_idx = None - self._past_length = None - self.input_names = [input_t.get_any_name() for input_t in self.model.inputs] - self.main_input_name = "input_ids" - if compile: - self.compile() - - def compile(self): - if self.request is None: - self.request = core.compile_model(self.model, self._device, self.ov_config).create_infer_request() - self._compile_token_emb() - - def _compile_token_emb(self): - if self.token_emb_request is None: - self.token_emb_request = core.compile_model(self.token_emb, self._device, self.ov_config) - - def to(self, device: str): - if isinstance(device, str): - self._device = device.upper() - self.clear_requests() - - return self - - def clear_requests(self): - del self.request - del self.token_emb_request - self.request = None - self.token_emb_request = None - - def embed_tokens(self, input_ids: torch.LongTensor): - self._compile_token_emb() - res = self.token_emb_request(input_ids, share_inputs=True) - return res[0] - - def prepare_inputs( - self, - input_ids: torch.LongTensor, - attention_mask: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - **kwargs, - ): - batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0] - - inputs = {} - # past_key_values are not used explicitly, instead they are handled inside the model - if past_key_values is None: - # This is the first iteration in a sequence, reset all states - if self.request is not None: - self.request.reset_state() - # Set initial value for the next beam_idx input that will be used at the current iteration - # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used - self.next_beam_idx = np.arange(batch_size, dtype=int) - self._past_length = 0 - past_len = self._get_past_length(past_key_values) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids if past_key_values is None else input_ids[:, -1:]) - - if hasattr(self.config, "scale_emb"): - inputs_embeds = inputs_embeds * self.config.scale_emb - inputs["inputs_embeds"] = inputs_embeds - - # Add the attention_mask inputs when needed - if "attention_mask" in self.input_names or "position_ids" in self.input_names: - if attention_mask is not None: - attention_mask = np.array(attention_mask) - else: - attention_mask = np.ones((inputs_embeds.shape[0], inputs_embeds.shape[1] + past_len), dtype=int) - - if "attention_mask" in self.input_names: - inputs["attention_mask"] = attention_mask - - if "position_ids" in self.input_names: - if position_ids is not None: - position_ids = np.array(position_ids) - else: - position_ids = np.cumsum(attention_mask, axis=1) - 1 - position_ids[attention_mask == 0] = 1 - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - inputs["position_ids"] = position_ids - - if "beam_idx" in self.input_names: - inputs["beam_idx"] = self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int) - - return inputs - - def forward( - self, - input_ids: torch.LongTensor, - attention_mask: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, - position_ids: Optional[torch.LongTensor] = None, - inputs_embeds: Optional[torch.LongTensor] = None, - **kwargs, - ): - self.compile() - - inputs = self.prepare_inputs( - 
input_ids=input_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - position_ids=position_ids, - inputs_embeds=inputs_embeds, - **kwargs, - ) - - # Run inference - self.request.start_async(inputs, share_inputs=True) - self.request.wait() - logits = self.request.get_tensor("logits").data - logits = torch.from_numpy(logits).to(self.device) - past_key_values = ((),) - self._past_length += inputs["inputs_embeds"].shape[1] - - return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) - - # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - attention_mask = kwargs.get("attention_mask", None) - use_cache = kwargs.get("use_cache", None) - - if past_key_values is not None: - past_len = self._get_past_length(past_key_values) - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and input_ids is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_len) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif input_ids is not None and past_len < input_ids.shape[1]: - input_ids = input_ids[:, past_len:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values and input_ids is not None: - position_ids = position_ids[:, -input_ids.shape[1] :] - - model_inputs = { - "input_ids": input_ids, - "past_key_values": past_key_values, - "use_cache": use_cache, - "position_ids": position_ids, - "attention_mask": attention_mask, - "inputs_embeds": inputs_embeds if past_key_values is None else None, - } - - return model_inputs - - def _get_past_length(self, past_key_values=None): - if past_key_values is None: - return 0 - return self._past_length - - # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache - def _reorder_cache(self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. - This is required to match `past_key_values` with the correct beam_idx at every generation step. 
- """ - self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration - return past_key_values - - def can_generate(self): - """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" - - return True - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - - -class OvMiniCPMV: - def __init__(self, config, vpm, resampler, llm, processor): - self.config = config - self.llm = llm - self.vpm = vpm - self.embed_dim = self.llm.config.hidden_size - self._resampler = resampler - self.processor = processor - self._pos_embeds = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, 70)).float() - self.max_size = (70, 70) - - self.terminators = ["<|im_end|>", "<|endoftext|>"] - - def set_decoder(self, decoder): - self.llm = decoder - - def get_decoder(self): - return self.llm - - def resampler(self, x, tgt_sizes): - bs = x.shape[0] - - patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] - - self._adjust_pos_cache(tgt_sizes) - - max_patch_len = torch.max(patch_len) - key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool) - - pos_embed = [] - for i in range(bs): - tgt_h, tgt_w = tgt_sizes[i] - pos_embed.append(self._pos_embeds[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1))) # patches * D - key_padding_mask[i, patch_len[i] :] = True - - pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(1, 0, 2) # BLD => L * B * D - - res = torch.from_numpy(self._resampler([x, pos_embed, key_padding_mask])[0]) - return res - - def _set_2d_pos_cache(self, max_size): - pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, max_size)).float() - self._pos_embed = pos_embed - - def _adjust_pos_cache(self, tgt_sizes): - max_h = torch.max(tgt_sizes[:, 0]) - max_w = torch.max(tgt_sizes[:, 1]) - if max_h > self.max_size[0] or max_w > self.max_size[1]: - self.max_size = [max(max_h, self.max_size[0]), max(max_w, self.max_size[1])] - self._set_2d_pos_cache(self.max_size) - - def get_vllm_embedding(self, data): - if "vision_hidden_states" not in data: - tgt_sizes = data["tgt_sizes"] - pixel_values_list = data["pixel_values"] - vision_hidden_states = [] - all_pixel_values = [] - img_cnt = [] - for pixel_values in pixel_values_list: - img_cnt.append(len(pixel_values)) - all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_values]) - - # exist image - if all_pixel_values: - tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)] - tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32) - - max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]) - - all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True, padding_value=0.0) - B, L, _ = all_pixel_values.shape - all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L) - - patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool) - for i in range(B): - patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True - - vision_batch_size = 1 - all_pixel_values = all_pixel_values - if B > vision_batch_size: - hs = [] - for i in range(0, B, vision_batch_size): - start_idx = i - end_idx = i + vision_batch_size - tmp_hs = torch.from_numpy( - self.vpm([all_pixel_values[start_idx:end_idx], patch_attn_mask[start_idx:end_idx], tgt_sizes[start_idx:end_idx]])[0] - ) - hs.append(tmp_hs) - vision_embedding = torch.cat(hs, dim=0) - else: - vision_embedding = torch.from_numpy(self.vpm([all_pixel_values, 
patch_attn_mask, tgt_sizes])[0]) - vision_embedding = self.resampler(vision_embedding, tgt_sizes) - - start = 0 - for pixel_values in pixel_values_list: - img_cnt = len(pixel_values) - if img_cnt > 0: - vision_hidden_states.append(vision_embedding[start : start + img_cnt]) - start += img_cnt - else: - vision_hidden_states.append([]) - else: # no image - dummy_feature = [] - for _ in range(len(pixel_values_list)): - vision_hidden_states.append(dummy_feature) - - else: - vision_hidden_states = data["vision_hidden_states"] - - if hasattr(self.llm.config, "scale_emb"): - vllm_embedding = self.llm.embed_tokens(data["input_ids"]) * self.llm.config.scale_emb - else: - vllm_embedding = self.llm.embed_tokens(data["input_ids"]) - - bs = len(data["input_ids"]) - for i in range(bs): - cur_vs_hs = vision_hidden_states[i] - if len(cur_vs_hs) > 0: - cur_vllm_emb = torch.from_numpy(vllm_embedding[i]) - cur_image_bound = data["image_bound"][i] - if len(cur_image_bound) > 0: - image_indices = torch.stack([torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound]) - - cur_vllm_emb.scatter_(0, image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]), cur_vs_hs.view(-1, cur_vs_hs.shape[-1])) - return vllm_embedding - - def forward(self, data, **kwargs): - vllm_embedding = self.get_vllm_embedding(data) - position_ids = data["position_ids"] - if position_ids.dtype != torch.int64: - position_ids = position_ids.long() - - return self.llm(input_ids=None, position_ids=position_ids, inputs_embeds=vllm_embedding, **kwargs) - - def _decode(self, inputs_embeds, tokenizer, attention_mask, decode_text=False, **kwargs): - terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] - output = self.llm.generate( - inputs_embeds=torch.from_numpy(inputs_embeds), pad_token_id=0, eos_token_id=terminators, attention_mask=attention_mask, **kwargs - ) - if decode_text: - return self._decode_text(output, tokenizer) - return output - - def _decode_stream(self, inputs_embeds, tokenizer, **kwargs): - terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] - streamer = TextIteratorStreamer(tokenizer=tokenizer) - generation_kwargs = {"inputs_embeds": torch.from_numpy(inputs_embeds), "pad_token_id": 0, "eos_token_id": terminators, "streamer": streamer} - generation_kwargs.update(kwargs) - - thread = Thread(target=self.llm.generate, kwargs=generation_kwargs) - thread.start() - - return streamer - - def _decode_text(self, result_ids, tokenizer): - terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] - result_text = [] - for result in result_ids: - result = result[result != 0] - if result[0] == tokenizer.bos_id: - result = result[1:] - if result[-1] in terminators: - result = result[:-1] - result_text.append(tokenizer.decode(result).strip()) - return result_text - - def generate( - self, - input_ids=None, - pixel_values=None, - tgt_sizes=None, - image_bound=None, - attention_mask=None, - tokenizer=None, - vision_hidden_states=None, - return_vision_hidden_states=False, - stream=False, - decode_text=False, - **kwargs, - ): - assert input_ids is not None - assert len(input_ids) == len(pixel_values) - - model_inputs = { - "input_ids": input_ids, - "image_bound": image_bound, - } - - if vision_hidden_states is None: - model_inputs["pixel_values"] = pixel_values - model_inputs["tgt_sizes"] = tgt_sizes - else: - model_inputs["vision_hidden_states"] = vision_hidden_states - - with torch.inference_mode(): - model_inputs["inputs_embeds"] = self.get_vllm_embedding(model_inputs) 
- - if stream: - result = self._decode_stream(model_inputs["inputs_embeds"], tokenizer, **kwargs) - else: - result = self._decode(model_inputs["inputs_embeds"], tokenizer, attention_mask, decode_text=decode_text, **kwargs) - - return result - - def chat( - self, - image, - msgs, - tokenizer, - processor=None, - vision_hidden_states=None, - max_new_tokens=2048, - min_new_tokens=0, - sampling=True, - max_inp_length=8192, - system_prompt="", - stream=False, - max_slice_nums=None, - use_image_id=None, - **kwargs, - ): - if isinstance(msgs[0], list): - batched = True - else: - batched = False - msgs_list = msgs - images_list = image - - if batched is False: - images_list, msgs_list = [images_list], [msgs_list] - else: - assert images_list is None, "Please integrate image to msgs when using batch inference." - images_list = [None] * len(msgs_list) - assert len(images_list) == len(msgs_list), "The batch dim of images_list and msgs_list should be the same." - - if processor is None: - if self.processor is None: - self.processor = AutoProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True) - processor = self.processor - - assert ( - self.config.query_num == processor.image_processor.image_feature_size - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert ( - self.config.patch_size == processor.image_processor.patch_size - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert ( - self.config.use_image_id == processor.image_processor.use_image_id - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert ( - self.config.slice_config.max_slice_nums == processor.image_processor.max_slice_nums - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." - assert ( - self.config.slice_mode == processor.image_processor.slice_mode - ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." 
- - prompts_lists = [] - input_images_lists = [] - for image, msgs in zip(images_list, msgs_list): - if isinstance(msgs, str): - msgs = json.loads(msgs) - copy_msgs = deepcopy(msgs) - - assert len(msgs) > 0, "msgs is empty" - assert sampling or not stream, "if use stream mode, make sure sampling=True" - - if image is not None and isinstance(copy_msgs[0]["content"], str): - copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]] - - images = [] - for i, msg in enumerate(copy_msgs): - role = msg["role"] - content = msg["content"] - assert role in ["user", "assistant"] - if i == 0: - assert role == "user", "The role of first msg should be user" - if isinstance(content, str): - content = [content] - cur_msgs = [] - for c in content: - if isinstance(c, Image.Image): - images.append(c) - cur_msgs.append("(./)") - elif isinstance(c, str): - cur_msgs.append(c) - msg["content"] = "\n".join(cur_msgs) - - if system_prompt: - sys_msg = {"role": "system", "content": system_prompt} - copy_msgs = [sys_msg] + copy_msgs - - prompts_lists.append(processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True)) - input_images_lists.append(images) - - inputs = processor( - prompts_lists, input_images_lists, max_slice_nums=max_slice_nums, use_image_id=use_image_id, return_tensors="pt", max_length=max_inp_length - ) - - if sampling: - generation_config = {"top_p": 0.8, "top_k": 100, "temperature": 0.7, "do_sample": True, "repetition_penalty": 1.05} - else: - generation_config = { - "num_beams": 3, - "repetition_penalty": 1.2, - } - - if min_new_tokens > 0: - generation_config["min_new_tokens"] = min_new_tokens - - generation_config.update((k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()) - - inputs.pop("image_sizes") - with torch.inference_mode(): - res = self.generate( - **inputs, - tokenizer=tokenizer, - max_new_tokens=max_new_tokens, - vision_hidden_states=vision_hidden_states, - stream=stream, - decode_text=True, - **generation_config, - ) - - if stream: - - def stream_gen(): - for text in res: - for term in self.terminators: - text = text.replace(term, "") - yield text - - return stream_gen() - - else: - if batched: - answer = res - else: - answer = res[0] - return answer - - -def init_model(model_dir, device): - config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) - llm = OvModelForCausalLMWithEmb(model_dir, device) - img_emb = core.compile_model(model_dir / image_emb_path, device) - resampler = core.compile_model(model_dir / resampler_path, device) - processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True) - - ov_model = OvMiniCPMV(config, img_emb, resampler, llm, processor) - return ov_model - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("model_dir", type=Path) - model_dir = parser.parse_args().model_dir - model_id = "openbmb/MiniCPM-V-2_6" - ckpt = model_dir / "ckpt" - if not ckpt.exists(): - snapshot_download(model_id, local_dir=ckpt, force_download=True) - patch_model_code(ckpt) - model = AutoModel.from_pretrained(ckpt, trust_remote_code=True) - model.eval() - model.config.save_pretrained(model_dir) - tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True) - tokenizer.save_pretrained(model_dir) - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, with_detokenizer=True) - ov.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml") - ov.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml") - processor = 
AutoProcessor.from_pretrained(ckpt, trust_remote_code=True)
-    processor.save_pretrained(model_dir)
-
-    convert_llm(model, model_dir)
-    del model.llm
-    gc.collect()
-
-    convert_vision_encoder(model, model_dir)
-    ov_cpm = init_model(model_dir, "CPU")
-    print(ov_cpm.chat(Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw), [{"role": "user", "content": "What is unusual on this image?"}], ov_cpm.processor.tokenizer))
-
-if "__main__" == __name__:
-    main()
diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt
index 8c49f7c1e6..66d0fff7bb 100644
--- a/tests/python_tests/requirements.txt
+++ b/tests/python_tests/requirements.txt
@@ -32,3 +32,4 @@ sacremoses
 librosa
 soundfile
 datasets
+git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
\ No newline at end of file
diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_vlm_api.py
index 1c6f2a16ca..207cc9880b 100644
--- a/tests/python_tests/test_vlm_api.py
+++ b/tests/python_tests/test_vlm_api.py
@@ -3,46 +3,18 @@
 import openvino_genai
 import pytest
-from openvino import Tensor
+import gc
+import os
 import numpy as np
 from PIL import Image
 from multiprocessing import Process
 from openvino_genai import VLMPipeline
+from openvino import Tensor
 from common import get_greedy, get_image_by_link, get_beam_search, get_greedy, get_multinomial_all_parameters
 def get_ov_model(model_dir):
-    #TODO: use optimum-intel
-    from export_MiniCPM import convert_llm, convert_vision_encoder, snapshot_download, patch_model_code
-    from transformers import AutoModel, AutoTokenizer, AutoProcessor
-    import os
-    import openvino_tokenizers
-    import openvino as ov
-    from pathlib import Path
-    import gc
-
-    model_id = "openbmb/MiniCPM-V-2_6"
-    ckpt = Path(os.path.join(model_dir, "ckpt"))
-    if not ckpt.exists():
-        snapshot_download(model_id, local_dir=ckpt, force_download=True)
-        patch_model_code(ckpt)
-    model = AutoModel.from_pretrained(ckpt, trust_remote_code=True)
-    model.eval()
-    model.config.save_pretrained(model_dir)
-    tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
-    tokenizer.save_pretrained(model_dir)
-    ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, with_detokenizer=True)
-    ov.save_model(ov_tokenizer, os.path.join(model_dir, "openvino_tokenizer.xml"))
-    ov.save_model(ov_detokenizer, os.path.join(model_dir, "openvino_detokenizer.xml"))
-    processor = AutoProcessor.from_pretrained(ckpt, trust_remote_code=True)
-    processor.save_pretrained(model_dir)
-
-    convert_llm(model, model_dir)
-    del model.llm
-    gc.collect()
-
-    convert_vision_encoder(model, model_dir)
-
+    os.system("optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 " + model_dir + " --trust-remote-code")
     return model_dir
@@ -65,6 +37,7 @@ def get_ov_model(model_dir):
 ]
 image_links_for_testing = [
+    [],
     [image_links[0]],
     [image_links[1], image_links[0]],
     [image_links[0], image_links[2], image_links[1]]
@@ -88,10 +61,15 @@ def streamer(word: str) -> bool:
     pipe = VLMPipeline(model_path, "CPU")
     pipe.start_chat()
-    pipe(prompts[0], images=images, generation_config=generation_config, streamer=streamer)
+    if len(images):
+        pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer)
+    else:
+        pipe.generate(prompts[0], generation_config=generation_config, streamer=streamer)
+
     for prompt in prompts[1:]:
         pipe.generate(prompt, generation_config=generation_config, streamer=streamer)
     pipe.finish_chat()
+    gc.collect()
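
A minimal usage sketch of the workflow this patch moves to: the model is exported once with optimum-cli (the same command get_ov_model issues above) and then driven through VLMPipeline.generate together with the newly bound set_generation_config. The export directory name, the local image file, and the uint8 NHWC preprocessing below are illustrative assumptions, not taken from this diff; the tests obtain images via the get_image_by_link helper instead.

# Assumed one-time export, mirroring get_ov_model in the test above:
#   optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6-ov --trust-remote-code
import numpy as np
from PIL import Image
from openvino import Tensor
from openvino_genai import VLMPipeline

model_dir = "MiniCPM-V-2_6-ov"  # hypothetical output directory of the export above
pipe = VLMPipeline(model_dir, "CPU")

# Adjust generation defaults through the newly exposed set_generation_config binding.
config = pipe.get_generation_config()
config.max_new_tokens = 100
pipe.set_generation_config(config)

# Pack an image as a uint8 [1, H, W, 3] tensor; this preprocessing is an assumption here.
pic = Image.open("cat.png").convert("RGB")  # hypothetical local image
image = Tensor(np.array(pic, dtype=np.uint8)[np.newaxis, ...])

pipe.start_chat()
print(pipe.generate("What is on the image?", images=[image]))
print(pipe.generate("Describe it in one sentence."))  # follow-up turn without an image, as in the chat sample
pipe.finish_chat()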