diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 097cca34dd..5bada5da5b 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -703,18 +703,18 @@ jobs:
           source ./ov/setupvars.sh
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-          python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/
+          python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
+          python -m pip install -U "optimum<1.23" --no-dependencies
+          optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code
           wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg
       - name: Run visual_language_chat sample - MiniCPM-V-2_6
         run: >
           source ./ov/setupvars.sh
-          && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ cat.jpg
+          && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./MiniCPM-V-2_6/ cat.jpg
           <<< $'What is on the image?\nWhat is special on the image?'
       - name: Download and convert LLaVa 1.5 model and an image
         run: |
           source ./ov/setupvars.sh
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
           optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/
           wget https://llava-vl.github.io/static/images/monalisa.jpg
       - name: Run visual_language_chat sample - LLaVa 1.5
@@ -729,7 +729,7 @@ jobs:
           source ./ov/setupvars.sh
           export PYTHONPATH=./build/:$PYTHONPATH
           printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt
-          timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./miniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt
+          timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./MiniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt
 
   cpp-continuous-batching-ubuntu:
     runs-on: ubuntu-20.04-8-cores
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index a3d44f28c1..bc56996a1f 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -268,7 +268,13 @@ jobs:
         run: |
           source ${OV_INSTALL_DIR}/setupvars.sh
           python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels
-          python -m pytest ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py
+          python -m pytest ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py
+
+      - run: python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
+      - run: python -m pip install -U "optimum<1.23" --no-dependencies
+      - run: >
+            source ${OV_INSTALL_DIR}/setupvars.sh
+            && python -m pytest ./tests/python_tests/test_vlm_api.py
 
   genai_python_lib_whisper:
     name: OpenVINO genai extension whisper tests (cmake + wheel)
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index e803bae787..d40aa02710 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -366,6 +366,8 @@ jobs:
         run: |
           . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
           python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
+          python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
+          python -m pip install -U "optimum<1.23" --no-dependencies
           python -m pytest ./tests/python_tests/test_vlm_api.py
         env:
           PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md
index b9d0ebcfe4..99ba417baf 100644
--- a/samples/cpp/visual_language_chat/README.md
+++ b/samples/cpp/visual_language_chat/README.md
@@ -10,7 +10,7 @@ It's not required to install [../../requirements.txt](../../requirements.txt) fo
 
 ```sh
 pip install --upgrade-strategy eager -r ../../requirements.txt
-export_MiniCPM-V-2_6.py miniCPM-V-2_6
+optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code MiniCPM-V-2_6
 ```
 
 ## Run
diff --git a/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py b/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py
deleted file mode 100644
index 903979b795..0000000000
--- a/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py
+++ /dev/null
@@ -1,1199 +0,0 @@
-import argparse
-import requests
-import torch
-from threading import Thread
-from copy import deepcopy
-import shutil
-import json
-from PIL import Image
-from transformers import AutoModel, AutoTokenizer, AutoProcessor, TextIteratorStreamer
-from transformers.generation import GenerationMixin
-from transformers import AutoConfig, GenerationConfig
-from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPooling
-from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
-from pathlib import Path
-from huggingface_hub import snapshot_download
-import types
-from typing import Optional, Tuple, List, Union
-from openvino.runtime import opset13
-import openvino as ov
-import openvino_tokenizers
-import numpy as np
-import gc
-from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher
-import time
-
-text_emb_path = Path("embed_tokens.xml")
-image_emb_path = Path("image_encoder.xml")
-resampler_path = Path("resampler.xml")
-llm_path = Path("language_model.xml")
-
-class InsertSlice(MatcherPass):
-    def __init__(self):
-        MatcherPass.__init__(self)
-        self.model_changed = False
-
-        param = WrapType("opset10.Result")
-
-        def callback(matcher: Matcher) -> bool:
-            root = matcher.get_match_root()
-            if root is None:
-                return False
-            if len(root.get_output_partial_shape(0)) == 3:
-                parent = root.input_value(0).get_node()
-                grand_parent = parent.input_value(0).get_node()
-
-                grand_parent_output = parent.input(0).get_source_output()
-                consumers = grand_parent_output.get_target_inputs()
-                start = np.array([0, -1, 0], dtype=np.int32)
-                stop = np.array([1, -2, grand_parent_output.get_partial_shape()[-1].get_length()], dtype=np.int32)
-                step = np.array([1, -1, 1], dtype=np.int32)
-                axes = np.array([0, 1, 2], dtype=np.int32)
-                slice = opset13.slice(grand_parent, start, stop, step, axes, name="inserted_slice")
-                for consumer in consumers:
-                    consumer.replace_source_output(slice.output(0))
-                self.model_changed = True
-                # Use new operation for additional matching
-                self.register_new_node(slice)
-                print("applied slice for lm head")
-
-                return True
-
-        self.register_matcher(Matcher(param, "InsertSlice"), callback)
-
-
-def model_has_state(ov_model: ov.Model):
-    return len(ov_model.get_sinks()) > 0
-
-
-def model_has_input_output_name(ov_model: ov.Model, name: str):
-    """
-    Helper function for checking that model has specified input or output name
-
-    Parameters:
-      ov_model (ov.Model):
-      name (str):
-          name of input or output
-
-    Returns:
-      True if input or output with requested name exists else False
-    """
-    return name in sum([list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], [])
-
-
-def fuse_cache_reorder(
-    ov_model: ov.Model,
-    not_kv_inputs: List[str],
-    key_value_input_names: List[str],
-    gather_dim: int,
-):
-    """
-    Fuses reored_cache during generate cycle into ov.Model. Used with stateful models, because we can not modify model state directly.
-
-    Adds a new beam_idx parameter and Gather op per each kv-cache input in a given model.
-    Should be run before make_stateful. Implements optimumum's _reorder_cache
-    inside the model in the beginning of each iteration.
-    Gather works along given gather_dim dimension that may vary from model to model.
-    KV-cache inputs are identified based on names in key_value_input_names.
-    Append the new beam_idx parameter to not_kv_inputs.
-
-    Parameters:
-      ov_model (`ov.Model`):
-          openvino model for processing
-      not_kv_inputs (`List[str]`):
-          list of input nodes in model that not related to past key values
-      key_value_input_names (`List[str]`):
-          list of names for key value input layers
-      gather_dim (int):
-          dimension for gathering cache during reorder pass
-    """
-
-    if model_has_input_output_name(ov_model, "beam_idx"):
-        raise ValueError("Model already has fused cache")
-    input_batch = ov_model.input("inputs_embeds").get_partial_shape()[0]
-    beam_idx = opset13.parameter(name="beam_idx", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch]))
-    beam_idx.output(0).get_tensor().add_names({"beam_idx"})
-    ov_model.add_parameters([beam_idx])
-    not_kv_inputs.append(ov_model.inputs[-1])
-    # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx
-    for input_name in key_value_input_names:
-        parameter_output_port = ov_model.input(input_name)
-        consumers = parameter_output_port.get_target_inputs()
-        gather = opset13.gather(parameter_output_port, beam_idx, opset13.constant(gather_dim))
-        for consumer in consumers:
-            consumer.replace_source_output(gather.output(0))
-    ov_model.validate_nodes_and_infer_types()
-
-
-def build_state_initializer(ov_model: ov.Model, batch_dim: int):
-    """
-    Build initialization ShapeOf Expression for all ReadValue ops
-
-    Parameters:
-      ov_model (ov.Model):
-          openvino model
-      batch_dim (int):
-          index of dimension corresponding to batch size
-    """
-    input_ids = ov_model.input("inputs_embeds")
-    batch = opset13.gather(
-        opset13.shape_of(input_ids, output_type="i64"),
-        opset13.constant([0]),
-        opset13.constant(0),
-    )
-    for op in ov_model.get_ops():
-        if op.get_type_name() == "ReadValue":
-            dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))]
-            dims[batch_dim] = batch
-            dims = [(opset13.constant(np.array([dim], dtype=np.int64)) if isinstance(dim, int) else dim) for dim in dims]
-            shape = opset13.concat(dims, axis=0)
-            broadcast = opset13.broadcast(opset13.constant(0.0, dtype=op.get_output_element_type(0)), shape)
-            op.set_arguments([broadcast])
-    ov_model.validate_nodes_and_infer_types()
-
-
-def make_stateful(
-    ov_model: ov.Model,
-    not_kv_inputs: List[str],
-    key_value_input_names: List[str],
-    key_value_output_names: List[str],
-    batch_dim: int,
-    num_attention_heads: int,
-    num_beams_and_batch: int = None,
-):
-    """
-    Hides kv-cache inputs and outputs inside the model as variables.
-
-    Parameters:
-        ov_model (ov.Model):
-            openvino model
-        not_kv_inputs (`List[str]`):
-            list of input nodes in model that not related to past key values
-        key_value_input_names (`List[str]`):
-            list of names for key value input layers
-        key_value_output_names (`List[str]`):
-            list of names for key value input layers
-        batch_dim (int):
-            index of batch dimension in key value layers
-        num_attention_heads (int):
-            number of attention heads for batch dimension initialization
-        num_beams_an_batch (int):
-            precalculated number of beams and batch for shapes initialization
-    """
-    from openvino._offline_transformations import apply_make_stateful_transformation
-
-    input_output_map = {}
-
-    if num_beams_and_batch is not None:
-        # Set batch size for input_ids and attention mask to avoid dynamic dimension got propagated from the end of the model back to ReadValue
-        for input in not_kv_inputs:
-            shape = input.get_partial_shape()
-            if shape.rank.get_length() <= 2:  # == 1 for beam_index
-                shape[0] = num_beams_and_batch
-                input.get_node().set_partial_shape(shape)
-    for kv_name_pair in zip(key_value_input_names, key_value_output_names):
-        input_output_map[kv_name_pair[0]] = kv_name_pair[1]
-        if num_beams_and_batch is not None:
-            input = ov_model.input(kv_name_pair[0])
-            shape = input.get_partial_shape()
-            shape[batch_dim] = num_beams_and_batch * num_attention_heads
-            input.get_node().set_partial_shape(shape)
-
-    if num_beams_and_batch is not None:
-        # Re-validation model if shapes are altered above
-        ov_model.validate_nodes_and_infer_types()
-
-    apply_make_stateful_transformation(ov_model, input_output_map)
-    if num_beams_and_batch is None:
-        build_state_initializer(ov_model, batch_dim)
-
-
-def patch_stateful(ov_model):
-    key_value_input_names = [key.get_any_name() for key in ov_model.inputs[2:-1]]
-    key_value_output_names = [key.get_any_name() for key in ov_model.outputs[1:]]
-    not_kv_inputs = [input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names())]
-    if not key_value_input_names or not key_value_output_names:
-        return
-    batch_dim = 0
-    num_attention_heads = 1
-
-    fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim)
-    make_stateful(
-        ov_model,
-        not_kv_inputs,
-        key_value_input_names,
-        key_value_output_names,
-        batch_dim,
-        num_attention_heads,
-        None,
-    )
-
-
-def cleanup_torchscript_cache():
-    """
-    Helper for removing cached model representation
-    """
-    torch._C._jit_clear_class_registry()
-    torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()
-    torch.jit._state._clear_class_state()
-
-
-def get_2d_sincos_pos_embed(embed_dim, image_size):
-    """
-    image_size: image_size or (image_height, image_width)
-    return:
-    pos_embed: [image_height, image_width, embed_dim]
-    """
-    if isinstance(image_size, int):
-        grid_h_size, grid_w_size = image_size, image_size
-    else:
-        grid_h_size, grid_w_size = image_size[0], image_size[1]
-
-    grid_h = np.arange(grid_h_size, dtype=np.float32)
-    grid_w = np.arange(grid_w_size, dtype=np.float32)
-    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
-    grid = np.stack(grid, axis=0)
-
-    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
-    return pos_embed
-
-
-def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
-    assert embed_dim % 2 == 0
-
-    # use half of dimensions to encode grid_h
-    emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[0])  # (H, W, D/2)
-    emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[1])  # (H, W, D/2)
-
-    emb = np.concatenate([emb_h, emb_w], axis=-1)  # (H, W, D)
-    return emb
-
-
-def get_1d_sincos_pos_embed_from_grid_new(embed_dim, pos):
-    """
-    embed_dim: output dimension for each position
-    pos: a list of positions to be encoded: size (H, W)
-    out: (H, W, D)
-    """
-    assert embed_dim % 2 == 0
-    omega = np.arange(embed_dim // 2, dtype=np.float32)
-    omega /= embed_dim / 2.0
-    omega = 1.0 / 10000**omega  # (D/2,)
-
-    out = np.einsum("hw,d->hwd", pos, omega)  # (H, W, D/2), outer product
-
-    # Align with C++ which always uses double
-    emb_sin = np.sin(out.astype(np.float64)).astype(np.float32)  # (H, W, D/2)
-    emb_cos = np.cos(out.astype(np.float64)).astype(np.float32)  # (H, W, D/2)
-
-    emb = np.concatenate([emb_sin, emb_cos], axis=-1)  # (H, W, D)
-    return emb
-
-
-def patch_model_code(orig_model_dir):
-    model_file = orig_model_dir / "modeling_navit_siglip.py"
-    orig_model_file = model_file.parent / ("orig_" + model_file.name)
-    if not orig_model_file.exists():
-        model_file.rename(orig_model_file)
-        with orig_model_file.open("r") as f:
-            content = f.read()
-            content = content.replace("if is_flash_attn_2_available():", "")
-            content = content.replace("from flash_attn import flash_attn_func, flash_attn_varlen_func", "")
-            content = content.replace("from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "")
-
-            with model_file.open("w") as out_f:
-                out_f.write(content)
-
-
-def convert_llm(model, model_dir):
-    model.llm.config.save_pretrained(model_dir / text_emb_path.parent)
-    print("⌛ Convert Input embedding model")
-    ov_model = ov.convert_model(model.llm.model.embed_tokens, example_input=torch.ones([1, 10], dtype=torch.long))
-
-    ov.save_model(ov_model, model_dir / text_emb_path)
-    del ov_model
-    cleanup_torchscript_cache()
-    gc.collect()
-    print("✅ Input embedding model successfully converted")
-
-    print("⌛ Convert Language model")
-    hidden_size = model.llm.config.hidden_size
-    num_pkv = model.llm.config.num_hidden_layers
-    pkv_shape = (2, model.llm.config.num_key_value_heads, 2, hidden_size // model.llm.config.num_attention_heads)
-
-    input_embeds = torch.randn((2, 2, hidden_size))
-    attention_mask = torch.ones([2, 4], dtype=torch.long)
-    position_ids = torch.tensor([[2, 3], [2, 3]], dtype=torch.long)
-    input_names = ["attention_mask", "position_ids"]
-    output_names = ["logits"]
-
-    past_key_values = []
-    for i in range(num_pkv):
-        kv = [torch.randn(pkv_shape) for _ in range(2)]
-        past_key_values.append(kv)
-        input_names.extend([f"past_key_values.{i}.key", f"past_key_values.{i}.value"])
-        output_names.extend([f"present.{i}.key", f"present.{i}.value"])
-    input_names.append("inputs_embeds")
-
-    example_input = {"inputs_embeds": input_embeds, "attention_mask": attention_mask, "position_ids": position_ids, "past_key_values": past_key_values}
-
-    model.llm.config.torchscript = True
-
-    ov_model = ov.convert_model(model.llm, example_input=example_input)
-
-    for out, out_name in zip(ov_model.outputs, output_names):
-        out.get_tensor().set_names({out_name})
-
-    for inp, inp_name in zip(ov_model.inputs, input_names):
-        inp.get_tensor().set_names({inp_name})
-
-    patch_stateful(ov_model)
-
-    ov.save_model(ov_model, model_dir / llm_path)
-    del ov_model
-
-    cleanup_torchscript_cache()
-    gc.collect()
-    print("✅ Language model successfully converted")
-
-
-def convert_vision_encoder(model, model_dir):
-    tgt_sizes = torch.tensor([[23, 45]])
-    if not (model_dir / image_emb_path).exists():
-        print("⌛ Convert Image embedding model")
-        def siglip_vis_embed_forward(
-            self,
-            pixel_values: torch.FloatTensor,
-            patch_attention_mask: torch.BoolTensor,
-            tgt_sizes: Optional[torch.IntTensor] = None,
-            position_ids: Optional[torch.FloatTensor] = None,
-        ) -> torch.Tensor:
-            patch_embeds = self.patch_embedding(pixel_values)
-            embeddings = patch_embeds.flatten(2).transpose(1, 2)
-
-            if position_ids is None:
-                batch_size = pixel_values.size(0)
-                max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
-                max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
-                boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
-                position_ids = torch.full(
-                    size=(
-                        batch_size,
-                        max_nb_patches_h * max_nb_patches_w,
-                    ),
-                    fill_value=0,
-                )
-
-                for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
-                    if tgt_sizes is not None:
-                        nb_patches_h = tgt_sizes[batch_idx][0]
-                        nb_patches_w = tgt_sizes[batch_idx][1]
-                    else:
-                        nb_patches_h = p_attn_mask[:, 0].sum()
-                        nb_patches_w = p_attn_mask[0].sum()
-
-                    fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
-                    fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
-
-                    bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
-                    bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
-
-                    pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
-                    position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
-
-            position_ids = position_ids.to(self.position_embedding.weight.device)
-
-            embeddings = embeddings + self.position_embedding(position_ids)
-            return embeddings
-
-        def siglip_attn_forward(
-            self,
-            hidden_states: torch.Tensor,
-            attention_mask: Optional[torch.Tensor] = None,
-            output_attentions: Optional[bool] = False,
-        ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-            """Input shape: Batch x Time x Channel"""
-
-            batch_size, q_len, _ = hidden_states.size()
-
-            query_states = self.q_proj(hidden_states)
-            key_states = self.k_proj(hidden_states)
-            value_states = self.v_proj(hidden_states)
-
-            query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-            key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-            value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-
-            attn_output = torch.nn.functional.scaled_dot_product_attention(
-                query_states, key_states, value_states, attention_mask, is_causal=attention_mask is None
-            )
-
-            attn_output = attn_output.transpose(1, 2).contiguous()
-            attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
-
-            attn_output = self.out_proj(attn_output)
-
-            return attn_output, None
-
-        def siglip_transformer_forward(
-            self,
-            pixel_values,
-            patch_attention_mask: Optional[torch.BoolTensor] = None,
-            tgt_sizes: Optional[torch.IntTensor] = None,
-            position_ids: Optional[torch.FloatTensor] = None,
-            output_attentions: Optional[bool] = None,
-            output_hidden_states: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
-        ) -> Union[Tuple, BaseModelOutputWithPooling]:
-            output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-            output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-            return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-            batch_size = pixel_values.size(0)
-            if patch_attention_mask is None:
-                patch_attention_mask = torch.ones(
-                    size=(
-                        batch_size,
-                        pixel_values.size(2) // self.config.patch_size,
-                        pixel_values.size(3) // self.config.patch_size,
-                    ),
-                    dtype=torch.bool,
-                    device=pixel_values.device,
-                )
-
-            hidden_states = self.embeddings(
-                pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, tgt_sizes=tgt_sizes, position_ids=position_ids
-            )
-
-            patch_attention_mask = patch_attention_mask.view(batch_size, -1)
-            attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) if not self._use_flash_attention_2 else patch_attention_mask
-
-            encoder_outputs = self.encoder(
-                inputs_embeds=hidden_states,
-                attention_mask=attention_mask,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-                return_dict=return_dict,
-            )
-
-            last_hidden_state = encoder_outputs[0]
-            last_hidden_state = self.post_layernorm(last_hidden_state)
-
-            if not return_dict:
-                return (last_hidden_state, None) + encoder_outputs[1:]
-
-            return BaseModelOutputWithPooling(
-                last_hidden_state=last_hidden_state,
-                pooler_output=None,
-                hidden_states=encoder_outputs.hidden_states,
-                attentions=encoder_outputs.attentions,
-            )
-
-        vpm = model.vpm
-        vpm.embeddings.forward = types.MethodType(siglip_vis_embed_forward, vpm.embeddings)
-        for layer in vpm.encoder.layers:
-            layer.self_attn.forward = types.MethodType(siglip_attn_forward, layer.self_attn)
-        vpm.forward = types.MethodType(siglip_transformer_forward, vpm)
-
-        pixel_values = torch.randn([1, 3, 14, 14490])
-        patch_attn_mask = torch.zeros((1, 1, 1035), dtype=torch.bool)
-        patch_attn_mask[0, 0, : tgt_sizes[0][0] * tgt_sizes[0][1]] = True
-        position_ids = prepare_vis_position_ids(
-            pixel_values, patch_attn_mask, tgt_sizes, model.config.vision_config.patch_size, model.config.vision_config.image_size // model.config.patch_size
-        )
-        ov_model = ov.convert_model(vpm, example_input={"pixel_values": pixel_values, "position_ids": position_ids, "patch_attention_mask": patch_attn_mask})
-        ov.save_model(ov_model, model_dir / image_emb_path)
-        del ov_model
-        cleanup_torchscript_cache()
-        gc.collect()
-        print("✅ Image embedding model successfully converted")
-
-    if not (model_dir / resampler_path).exists():
-        print("⌛ Convert Resamler model")
-
-        def resampler_forward(self, x, pos_embed, key_padding_mask):
-            bs = x.shape[0]
-            x = self.kv_proj(x)  # B * L * D
-            x = self.ln_kv(x).permute(1, 0, 2)  # L * B * D
-
-            q = self.ln_q(self.query)  # Q * D
-
-            q_bs = q.unsqueeze(1).repeat(1, bs, 1)
-
-            out = self.attn(q_bs, x + pos_embed, x, key_padding_mask=key_padding_mask)[0]  # Q * B * D  # L * B * D +  L * B * D
-            #  out: Q * B * D
-            x = out.permute(1, 0, 2)  # B * Q * D
-
-            x = self.ln_post(x)
-            x = x @ self.proj
-            return x
-
-        model.resampler.forward = types.MethodType(resampler_forward, model.resampler)
-
-        pos_embed_base = get_2d_sincos_pos_embed(model.resampler.embed_dim, 70)
-
-        patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]
-
-        max_patch_len = torch.max(patch_len)
-        key_padding_mask = torch.zeros((1, max_patch_len), dtype=torch.bool)
-
-        pos_embed = []
-        tgt_h, tgt_w = tgt_sizes[0]
-        pos_embed = torch.from_numpy(pos_embed_base[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, 1, -1)))  # patches * D
-        key_padding_mask[0, patch_len:] = True
-
-        ov_model = ov.convert_model(model.resampler, example_input=[torch.randn(1, 1035, 1152), pos_embed, key_padding_mask])
-        ov.save_model(ov_model, model_dir / resampler_path)
-        del ov_model
-        cleanup_torchscript_cache()
-        del model.resampler
-        gc.collect()
-        print("✅ Resampler model successfully converted")
-
-
-def copy_llm_files(model_dir, dst_dir):
-    shutil.copy(model_dir / text_emb_path, model_dir / dst_dir / text_emb_path.name)
-    shutil.copy(model_dir / text_emb_path.with_suffix(".bin"), model_dir / dst_dir / text_emb_path.with_suffix(".bin").name)
-    shutil.copy(model_dir / llm_path.parent / "config.json", model_dir / dst_dir / "config.json")
-    shutil.copy(model_dir / llm_path.parent / "configuration_minicpm.py", model_dir / dst_dir / "configuration_minicpm.py")
-    shutil.copy(model_dir / llm_path.parent / "modeling_navit_siglip.py", model_dir / dst_dir / "modeling_navit_siglip.py")
-
-
-def prepare_vis_position_ids(pixel_values, patch_attention_mask, tgt_sizes, patch_size, num_patches_per_side):
-    batch_size = pixel_values.size(0)
-    max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
-    max_nb_patches_h, max_nb_patches_w = max_im_h // patch_size, max_im_w // patch_size
-    boundaries = torch.arange(1 / num_patches_per_side, 1.0, 1 / num_patches_per_side)
-    position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0)
-
-    for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
-        if tgt_sizes is not None:
-            nb_patches_h = tgt_sizes[batch_idx][0]
-            nb_patches_w = tgt_sizes[batch_idx][1]
-        else:
-            nb_patches_h = p_attn_mask[:, 0].sum()
-            nb_patches_w = p_attn_mask[0].sum()
-
-        fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
-        fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
-
-        bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
-        bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
-
-        pos_ids = (bucket_coords_h[:, None] * num_patches_per_side + bucket_coords_w).flatten()
-        position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
-
-    return position_ids
-
-
-core = ov.Core()
-
-
-class OvModelForCausalLMWithEmb(GenerationMixin):
-    def __init__(self, model_dir, device="CPU", ov_config=None, compile=True, slice_lm_head=True) -> None:
-        self._supports_cache_class = False
-        self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
-        self.config.is_decoder = True
-        self.config.is_encoder_decoder = False
-        self.generation_config = GenerationConfig.from_model_config(self.config)
-        model_dir = Path(model_dir)
-        self.model = core.read_model(model_dir / "language_model.xml")
-        self.token_emb = core.read_model(model_dir / "embed_tokens.xml")
-        if slice_lm_head:
-            self.slice_lm_head()
-        self.request = None
-        self.token_emb_request = None
-        self._device = device.upper()
-        self.device = torch.device("cpu")
-        self.ov_config = ov_config
-        self.next_beam_idx = None
-        self._past_length = None
-        self.input_names = [input_t.get_any_name() for input_t in self.model.inputs]
-        self.main_input_name = "input_ids"
-        self.llm_times = []
-        if compile:
-            self.compile()
-
-    def slice_lm_head(self):
-        manager = Manager()
-        manager.register_pass(InsertSlice())
-        manager.run_passes(self.model)
-        self.model.validate_nodes_and_infer_types()
-
-    def compile(self):
-        if self.request is None:
-            self.request = core.compile_model(self.model, self._device, self.ov_config).create_infer_request()
-        self._compile_token_emb()
-
-    def _compile_token_emb(self):
-        if self.token_emb_request is None:
-            self.token_emb_request = core.compile_model(self.token_emb, self._device, self.ov_config)
-
-    def to(self, device: str):
-        if isinstance(device, str):
-            self._device = device.upper()
-            self.clear_requests()
-
-        return self
-
-    def clear_requests(self):
-        del self.request
-        del self.token_emb_request
-        self.request = None
-        self.token_emb_request = None
-
-    def embed_tokens(self, input_ids: torch.LongTensor):
-        self._compile_token_emb()
-        res = self.token_emb_request(input_ids, share_inputs=True)
-        return res[0]
-
-    def prepare_inputs(
-        self,
-        input_ids: torch.LongTensor,
-        attention_mask: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        **kwargs,
-    ):
-        batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
-
-        inputs = {}
-        # past_key_values are not used explicitly, instead they are handled inside the model
-        if past_key_values is None:
-            self.llm_times = []
-            # This is the first iteration in a sequence, reset all states
-            if self.request is not None:
-                self.request.reset_state()
-                # Set initial value for the next beam_idx input that will be used at the current iteration
-                # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used
-                self.next_beam_idx = np.arange(batch_size, dtype=int)
-                self._past_length = 0
-        past_len = self._get_past_length(past_key_values)
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids if past_key_values is None else input_ids[:, -1:])
-
-            if hasattr(self.config, "scale_emb"):
-                inputs_embeds = inputs_embeds * self.config.scale_emb
-        inputs["inputs_embeds"] = inputs_embeds
-
-        # Add the attention_mask inputs when needed
-        if "attention_mask" in self.input_names or "position_ids" in self.input_names:
-            if attention_mask is not None:
-                attention_mask = np.array(attention_mask)
-            else:
-                attention_mask = np.ones((inputs_embeds.shape[0], inputs_embeds.shape[1] + past_len), dtype=int)
-
-        if "attention_mask" in self.input_names:
-            inputs["attention_mask"] = attention_mask
-
-        if "position_ids" in self.input_names:
-            if position_ids is not None:
-                position_ids = np.array(position_ids)
-            else:
-                position_ids = np.cumsum(attention_mask, axis=1) - 1
-                position_ids[attention_mask == 0] = 1
-                if past_key_values:
-                    position_ids = position_ids[:, -input_ids.shape[1] :]
-
-            inputs["position_ids"] = position_ids
-
-        if "beam_idx" in self.input_names:
-            inputs["beam_idx"] = self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int)
-
-        return inputs
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        attention_mask: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.LongTensor] = None,
-        **kwargs,
-    ):
-        self.compile()
-
-        inputs = self.prepare_inputs(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            past_key_values=past_key_values,
-            position_ids=position_ids,
-            inputs_embeds=inputs_embeds,
-            **kwargs,
-        )
-
-        # Run inference
-        self.request.start_async(inputs, share_inputs=True)
-        self.request.wait()
-        logits = self.request.get_tensor("logits").data
-        logits = torch.from_numpy(logits).to(self.device)
-        past_key_values = ((),)
-        self._past_length += inputs["inputs_embeds"].shape[1]
-
-        return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values)
-
-    # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
-    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
-        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
-        attention_mask = kwargs.get("attention_mask", None)
-        use_cache = kwargs.get("use_cache", None)
-
-        if past_key_values is not None:
-            past_len = self._get_past_length(past_key_values)
-            # Keep only the unprocessed tokens:
-            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
-            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
-            # input)
-            if attention_mask is not None and input_ids is not None and attention_mask.shape[1] > input_ids.shape[1]:
-                input_ids = input_ids[:, -(attention_mask.shape[1] - past_len) :]
-            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
-            # input_ids based on the past_length.
-            elif input_ids is not None and past_len < input_ids.shape[1]:
-                input_ids = input_ids[:, past_len:]
-            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens
-        position_ids = kwargs.get("position_ids", None)
-        if attention_mask is not None and position_ids is None and "position_ids" in self.input_names:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values and input_ids is not None:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
-
-        model_inputs = {
-            "input_ids": input_ids,
-            "past_key_values": past_key_values,
-            "use_cache": use_cache,
-            "position_ids": position_ids,
-            "attention_mask": attention_mask,
-            "inputs_embeds": inputs_embeds if past_key_values is None else None,
-        }
-
-        return model_inputs
-
-    def _get_past_length(self, past_key_values=None):
-        if past_key_values is None:
-            return 0
-        return self._past_length
-
-    # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache
-    def _reorder_cache(self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
-        """
-        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
-        [`~PreTrainedModel.beam_sample`] is called.
-        This is required to match `past_key_values` with the correct beam_idx at every generation step.
-        """
-        self.next_beam_idx = np.array(beam_idx)  # save beam_idx to be used as an input in the next iteration
-        return past_key_values
-
-    def can_generate(self):
-        """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate."""
-
-        return True
-
-    def __call__(self, *args, **kwargs):
-        return self.forward(*args, **kwargs)
-
-
-class OvMiniCPMV:
-    def __init__(self, config, vpm, resampler, llm, processor):
-        self.config = config
-        self.llm = llm
-        self.vpm = vpm
-        self.embed_dim = self.llm.config.hidden_size
-        self._resampler = resampler
-        self.processor = processor
-        self._pos_embeds = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, 70)).float()
-        self.max_size = (70, 70)
-
-        self.terminators = ["<|im_end|>", "<|endoftext|>"]
-
-    def set_decoder(self, decoder):
-        self.llm = decoder
-
-    def get_decoder(self):
-        return self.llm
-
-    def resampler(self, x, tgt_sizes):
-        bs = x.shape[0]
-
-        patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]
-
-        self._adjust_pos_cache(tgt_sizes)
-
-        max_patch_len = torch.max(patch_len)
-        key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool)
-
-        pos_embed = []
-        for i in range(bs):
-            tgt_h, tgt_w = tgt_sizes[i]
-            pos_embed.append(self._pos_embeds[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)))  # patches * D
-            key_padding_mask[i, patch_len[i] :] = True
-
-        pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(1, 0, 2)  # BLD => L * B * D
-
-        res = torch.from_numpy(self._resampler([x, pos_embed, key_padding_mask])[0])
-        return res
-
-    def _set_2d_pos_cache(self, max_size):
-        pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, max_size)).float()
-        self._pos_embed = pos_embed
-
-    def _adjust_pos_cache(self, tgt_sizes):
-        max_h = torch.max(tgt_sizes[:, 0])
-        max_w = torch.max(tgt_sizes[:, 1])
-        if max_h > self.max_size[0] or max_w > self.max_size[1]:
-            self.max_size = [max(max_h, self.max_size[0]), max(max_w, self.max_size[1])]
-            self._set_2d_pos_cache(self.max_size)
-
-    def get_vllm_embedding(self, data):
-        if "vision_hidden_states" not in data:
-            tgt_sizes = data["tgt_sizes"]
-            pixel_values_list = data["pixel_values"]
-            vision_hidden_states = []
-            all_pixel_values = []
-            img_cnt = []
-            for pixel_values in pixel_values_list:
-                img_cnt.append(len(pixel_values))
-                all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_values])
-
-            # exist image
-            if all_pixel_values:
-                tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)]
-                tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32)
-
-                max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1])
-
-                all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True, padding_value=0.0)
-                B, L, _ = all_pixel_values.shape
-                all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
-
-                patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool)
-                for i in range(B):
-                    patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True
-
-                vision_batch_size = 32
-                all_pixel_values = all_pixel_values
-                if B > vision_batch_size:
-                    hs = []
-                    for i in range(0, B, vision_batch_size):
-                        start_idx = i
-                        end_idx = i + vision_batch_size
-                        block_pxl_values = all_pixel_values[start_idx:end_idx]
-                        block_patch_attn_mask = patch_attn_mask[start_idx:end_idx]
-                        block_tgt_sizes = tgt_sizes[start_idx:end_idx]
-                        block_position_ids = prepare_vis_position_ids(
-                            block_pxl_values,
-                            block_patch_attn_mask,
-                            block_tgt_sizes,
-                            self.config.vision_config.patch_size,
-                            self.config.vision_config.image_size // self.config.patch_size,
-                        )
-                        start = time.perf_counter()
-                        tmp_hs = torch.from_numpy(self.vpm([block_pxl_values, block_patch_attn_mask, block_position_ids])[0])
-                        self.vpm_times.append(time.perf_counter() - start)
-                        hs.append(tmp_hs)
-                    vision_embedding = torch.cat(hs, dim=0)
-                else:
-                    position_ids = prepare_vis_position_ids(
-                        all_pixel_values,
-                        patch_attn_mask,
-                        tgt_sizes,
-                        self.config.vision_config.patch_size,
-                        self.config.vision_config.image_size // self.config.patch_size,
-                    )
-                    start = time.perf_counter()
-                    vision_embedding = torch.from_numpy(self.vpm([all_pixel_values, patch_attn_mask, position_ids])[0])
-                    vision_embedding = torch.from_numpy(self.vpm([all_pixel_values, patch_attn_mask, position_ids])[0])
-                vision_embedding = self.resampler(vision_embedding, tgt_sizes)
-
-                start = 0
-                for pixel_values in pixel_values_list:
-                    img_cnt = len(pixel_values)
-                    if img_cnt > 0:
-                        vision_hidden_states.append(vision_embedding[start : start + img_cnt])
-                        start += img_cnt
-                    else:
-                        vision_hidden_states.append([])
-            else:  # no image
-                dummy_feature = []
-                for _ in range(len(pixel_values_list)):
-                    vision_hidden_states.append(dummy_feature)
-
-        else:
-            vision_hidden_states = data["vision_hidden_states"]
-
-        if hasattr(self.llm.config, "scale_emb"):
-            vllm_embedding = self.llm.embed_tokens(data["input_ids"]) * self.llm.config.scale_emb
-        else:
-            vllm_embedding = self.llm.embed_tokens(data["input_ids"])
-
-        bs = len(data["input_ids"])
-        for i in range(bs):
-            cur_vs_hs = vision_hidden_states[i]
-            if len(cur_vs_hs) > 0:
-                cur_vllm_emb = torch.from_numpy(vllm_embedding[i])
-                cur_image_bound = data["image_bound"][i]
-                if len(cur_image_bound) > 0:
-                    image_indices = torch.stack([torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound])
-
-                    cur_vllm_emb.scatter_(0, image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]), cur_vs_hs.view(-1, cur_vs_hs.shape[-1]))
-        return vllm_embedding
-
-    def forward(self, data, **kwargs):
-        vllm_embedding = self.get_vllm_embedding(data)
-        position_ids = data["position_ids"]
-        if position_ids.dtype != torch.int64:
-            position_ids = position_ids.long()
-
-        return self.llm(input_ids=None, position_ids=position_ids, inputs_embeds=vllm_embedding, **kwargs)
-
-    def _decode(self, inputs_embeds, tokenizer, attention_mask, decode_text=False, **kwargs):
-        terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
-        output = self.llm.generate(
-            inputs_embeds=torch.from_numpy(inputs_embeds), pad_token_id=0, eos_token_id=terminators, attention_mask=attention_mask, **kwargs
-        )
-        if decode_text:
-            return self._decode_text(output, tokenizer)
-        return output
-
-    def _decode_stream(self, inputs_embeds, tokenizer, **kwargs):
-        terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
-        streamer = TextIteratorStreamer(tokenizer=tokenizer)
-        generation_kwargs = {"inputs_embeds": torch.from_numpy(inputs_embeds), "pad_token_id": 0, "eos_token_id": terminators, "streamer": streamer}
-        generation_kwargs.update(kwargs)
-
-        thread = Thread(target=self.llm.generate, kwargs=generation_kwargs)
-        thread.start()
-
-        return streamer
-
-    def _decode_text(self, result_ids, tokenizer):
-        terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
-        result_text = []
-        for result in result_ids:
-            result = result[result != 0]
-            if result[0] == tokenizer.bos_id:
-                result = result[1:]
-            if result[-1] in terminators:
-                result = result[:-1]
-            result_text.append(tokenizer.decode(result).strip())
-        return result_text
-
-    def generate(
-        self,
-        input_ids=None,
-        pixel_values=None,
-        tgt_sizes=None,
-        image_bound=None,
-        attention_mask=None,
-        tokenizer=None,
-        vision_hidden_states=None,
-        return_vision_hidden_states=False,
-        stream=False,
-        decode_text=False,
-        **kwargs,
-    ):
-        assert input_ids is not None
-        assert len(input_ids) == len(pixel_values)
-
-        model_inputs = {
-            "input_ids": input_ids,
-            "image_bound": image_bound,
-        }
-
-        if vision_hidden_states is None:
-            model_inputs["pixel_values"] = pixel_values
-            model_inputs["tgt_sizes"] = tgt_sizes
-        else:
-            model_inputs["vision_hidden_states"] = vision_hidden_states
-
-        with torch.inference_mode():
-            model_inputs["inputs_embeds"] = self.get_vllm_embedding(model_inputs)
-
-            if stream:
-                result = self._decode_stream(model_inputs["inputs_embeds"], tokenizer, **kwargs)
-            else:
-                result = self._decode(model_inputs["inputs_embeds"], tokenizer, attention_mask, decode_text=decode_text, **kwargs)
-
-        return result
-
-    def chat(
-        self,
-        image,
-        msgs,
-        tokenizer,
-        processor=None,
-        vision_hidden_states=None,
-        max_new_tokens=2048,
-        min_new_tokens=0,
-        sampling=True,
-        max_inp_length=8192,
-        system_prompt="",
-        stream=False,
-        max_slice_nums=None,
-        use_image_id=None,
-        **kwargs,
-    ):
-        self.vpm_times = []
-        self.resampler_times = []
-        if isinstance(msgs[0], list):
-            batched = True
-        else:
-            batched = False
-        msgs_list = msgs
-        images_list = image
-
-        if batched is False:
-            images_list, msgs_list = [images_list], [msgs_list]
-        else:
-            assert images_list is None, "Please integrate image to msgs when using batch inference."
-            images_list = [None] * len(msgs_list)
-        assert len(images_list) == len(msgs_list), "The batch dim of images_list and msgs_list should be the same."
-
-        if processor is None:
-            if self.processor is None:
-                self.processor = AutoProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True)
-            processor = self.processor
-
-        assert (
-            self.config.query_num == processor.image_processor.image_feature_size
-        ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
-        assert (
-            self.config.patch_size == processor.image_processor.patch_size
-        ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
-        assert (
-            self.config.use_image_id == processor.image_processor.use_image_id
-        ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
-        assert (
-            self.config.slice_config.max_slice_nums == processor.image_processor.max_slice_nums
-        ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
-        assert (
-            self.config.slice_mode == processor.image_processor.slice_mode
-        ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
-
-        prompts_lists = []
-        input_images_lists = []
-        for image, msgs in zip(images_list, msgs_list):
-            if isinstance(msgs, str):
-                msgs = json.loads(msgs)
-            copy_msgs = deepcopy(msgs)
-
-            assert len(msgs) > 0, "msgs is empty"
-
-            if image is not None and isinstance(copy_msgs[0]["content"], str):
-                copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]]
-
-            images = []
-            for i, msg in enumerate(copy_msgs):
-                role = msg["role"]
-                content = msg["content"]
-                assert role in ["user", "assistant"]
-                if i == 0:
-                    assert role == "user", "The role of first msg should be user"
-                if isinstance(content, str):
-                    content = [content]
-                cur_msgs = []
-                for c in content:
-                    if isinstance(c, Image.Image):
-                        images.append(c)
-                        cur_msgs.append("(<image>./</image>)")
-                    elif isinstance(c, str):
-                        cur_msgs.append(c)
-                msg["content"] = "\n".join(cur_msgs)
-
-            if system_prompt:
-                sys_msg = {"role": "system", "content": system_prompt}
-                copy_msgs = [sys_msg] + copy_msgs
-
-            prompts_lists.append(processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True))
-            input_images_lists.append(images)
-
-        inputs = processor(
-            prompts_lists, input_images_lists, max_slice_nums=max_slice_nums, use_image_id=use_image_id, return_tensors="pt", max_length=max_inp_length
-        )
-
-        if sampling:
-            generation_config = {"top_p": 0.8, "top_k": 100, "temperature": 0.7, "do_sample": True, "repetition_penalty": 1.05}
-        else:
-            generation_config = {
-                "repetition_penalty": 1.0,
-            }
-
-        if min_new_tokens > 0:
-            generation_config["min_new_tokens"] = min_new_tokens
-
-        generation_config.update((k, kwargs[k]) for k in generation_config.keys() & kwargs.keys())
-
-        inputs.pop("image_sizes")
-        with torch.inference_mode():
-            res = self.generate(
-                **inputs,
-                tokenizer=tokenizer,
-                max_new_tokens=max_new_tokens,
-                vision_hidden_states=vision_hidden_states,
-                stream=stream,
-                decode_text=True,
-                **generation_config,
-            )
-
-        if stream:
-
-            def stream_gen():
-                for text in res:
-                    for term in self.terminators:
-                        text = text.replace(term, "")
-                    yield text
-
-            return stream_gen()
-
-        else:
-            if batched:
-                answer = res
-            else:
-                answer = res[0]
-            return answer
-
-
-def init_model(model_dir, device):
-    config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
-    llm = OvModelForCausalLMWithEmb(model_dir, device)
-    img_emb = core.compile_model(model_dir / image_emb_path, device)
-    resampler = core.compile_model(model_dir / resampler_path, device)
-    processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True)
-
-    ov_model = OvMiniCPMV(config, img_emb, resampler, llm, processor)
-    return ov_model
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("model_dir", type=Path)
-    model_dir = parser.parse_args().model_dir
-    model_id = "openbmb/MiniCPM-V-2_6"
-    ckpt = model_dir / "ckpt"
-    if not ckpt.exists():
-        snapshot_download(model_id, local_dir=ckpt, force_download=True)
-        patch_model_code(ckpt)
-    model = AutoModel.from_pretrained(ckpt, trust_remote_code=True)
-    model.eval()
-    model.config.save_pretrained(model_dir)
-    tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
-    tokenizer.save_pretrained(model_dir)
-    ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, with_detokenizer=True)
-    ov.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml")
-    ov.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml")
-    processor = AutoProcessor.from_pretrained(ckpt, trust_remote_code=True)
-    processor.save_pretrained(model_dir)
-
-    convert_llm(model, model_dir)
-    del model.llm
-    gc.collect()
-
-    convert_vision_encoder(model, model_dir)
-    # ov_cpm = init_model(model_dir, "CPU")
-    # print(ov_cpm.chat(Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw), [{"role": "user", "content": "What is unusual on this image?"}], ov_cpm.processor.tokenizer, sampling=False))
-
-if "__main__" == __name__:
-    main()
diff --git a/samples/python/visual_language_chat/README.md b/samples/python/visual_language_chat/README.md
index 16ef0959c5..06355d9ee5 100644
--- a/samples/python/visual_language_chat/README.md
+++ b/samples/python/visual_language_chat/README.md
@@ -10,8 +10,8 @@ It's not required to install [../../requirements.txt](../../requirements.txt) fo
 
 ```sh
 pip install --upgrade-strategy eager -r ../../requirements.txt
+optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code MiniCPM-V-2_6
 ```
-# TODO: add optimum cli command for miniCPM-V-2_6 when available
 
 ## Run:
 [This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image.
diff --git a/samples/requirements.txt b/samples/requirements.txt
index e901229c75..69d5572b28 100644
--- a/samples/requirements.txt
+++ b/samples/requirements.txt
@@ -5,4 +5,4 @@ einops==0.8.0  # For Qwen
 transformers_stream_generator==0.0.5  # For Qwen
 diffusers==0.30.3
 librosa # For Whisper
-torchvision # needed for mini-CPM export script. Need to remove when we switch to exporting with optimum-intel.
+torchvision  # For visual langugage models
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 56a4bbbeab..0d5772202d 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -350,28 +350,18 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         m_image_id{0} {
             if (m_vlm_config.model_type == VLMModelType::MINICPM) {
                 m_resampler = ov::Core{}.compile_model(
-                    model_dir / "resampler.xml", device, device_config
-                ).create_infer_request();
-
-                m_embedding = ov::Core{}.compile_model(
-                    model_dir / "embed_tokens.xml", device, device_config
-                ).create_infer_request();
-
-                m_language = ov::Core{}.compile_model(
-                    model_dir / "language_model.xml", device, device_config
+                    model_dir / "openvino_resampler_model.xml", device, device_config
                 ).create_infer_request();
 
                 m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
-            } else if (m_vlm_config.model_type == VLMModelType::LLAVA) {
-                m_language = ov::Core{}.compile_model(
-                    model_dir / "openvino_language_model.xml", device, device_config
-                ).create_infer_request();
-
-                // Reusing the same m_embedding for llava text_embeddings model
-                m_embedding = ov::Core{}.compile_model(
-                    model_dir / "openvino_text_embeddings_model.xml", device, device_config
-                ).create_infer_request();
             }
+            m_embedding = ov::Core{}.compile_model(
+                model_dir / "openvino_text_embeddings_model.xml", device, device_config
+            ).create_infer_request();
+
+            m_language = ov::Core{}.compile_model(
+                model_dir / "openvino_language_model.xml", device, device_config
+            ).create_infer_request();
 
             m_language.get_tensor("attention_mask").set_shape({1, 0});
     }
@@ -707,8 +697,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
             pipe.m_pos_embed_cache
         );
         size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end());
-        ov::Tensor key_padding_mask(ov::element::boolean, {bs, max_patch_len});
-        bool* mask_data = key_padding_mask.data<bool>();
+        ov::Tensor key_padding_mask(ov::element::f32, {bs, max_patch_len});
+        float* mask_data = key_padding_mask.data<float>();
         size_t embed_len = pipe.m_pos_embed_cache.get_shape().at(2);
         ov::Tensor pos_embed(ov::element::f32, {max_patch_len, bs, embed_len});  // BLD => L * B * D
         float* pos_embed_data = pos_embed.data<float>();
@@ -730,10 +720,10 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
             for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) {
                 std::fill_n(pos_embed_data + flat * bs * embed_len + i * embed_len, embed_len, 0.0f);
             }
-            std::fill_n(mask_data + i * max_patch_len, patch_len[i], false);
-            std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], true);
+            std::fill_n(mask_data + i * max_patch_len, patch_len[i], 0.0f);
+            std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], 1.0f);
         }
-        pipe.m_resampler.set_tensor("x", encoded_image);  // [N, H*W, old_hidden_size]
+        pipe.m_resampler.set_tensor("image_feature", encoded_image);  // [N, H*W, old_hidden_size]
         pipe.m_resampler.set_tensor("pos_embed", pos_embed);  // [H*W, N, new_hidden_size]
         pipe.m_resampler.set_tensor("key_padding_mask", key_padding_mask);  // [N, H*W]
         pipe.m_resampler.infer();
diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp
index df7f43af77..ee7e353e45 100644
--- a/src/cpp/src/visual_language/vision_encoder.cpp
+++ b/src/cpp/src/visual_language/vision_encoder.cpp
@@ -300,8 +300,8 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
     ov::Tensor input_tensor{ov::element::f32, {1, 3, size_t(resized_preprocessed.ny), size_t(resized_preprocessed.nx)}, (void*)(resized_preprocessed.buf.data())};
     ov::Tensor pixel_values = preprocess_for_encoder(input_tensor, patch_size);
     encoder.set_tensor("pixel_values", pixel_values);
-    ov::Tensor patch_attention_mask{ov::element::boolean, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}};
-    std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
+    ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}};
+    std::fill_n(patch_attention_mask.data<float>(), patch_attention_mask.get_size(), 1.0f);
     encoder.set_tensor("patch_attention_mask", patch_attention_mask);
     ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {resized_source_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
     encoder.set_tensor("position_ids", position_ids);
@@ -333,8 +333,8 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o
                 patch_size
             );
             encoder.set_tensor("pixel_values", pixel_values);
-            ov::Tensor patch_attention_mask{ov::element::boolean, {1, 1, slices_size.height * slices_size.width}};
-            std::fill_n(patch_attention_mask.data<bool>(), patch_attention_mask.get_size(), true);
+            ov::Tensor patch_attention_mask{ov::element::f32, {1, 1, slices_size.height * slices_size.width}};
+            std::fill_n(patch_attention_mask.data<float>(), patch_attention_mask.get_size(), 1.0f);
             encoder.set_tensor("patch_attention_mask", patch_attention_mask);
             ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {slices_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size);
             encoder.set_tensor("position_ids", position_ids);
@@ -431,12 +431,7 @@ ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig
 
 VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) :
     model_type(model_type) {
-        if (model_type == VLMModelType::MINICPM) {
-            m_vision_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request();
-        } else if (model_type == VLMModelType::LLAVA) {
-            // Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel
-            m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
-        }
+        m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request();
         m_processor_config = ov::genai::utils::from_config_json_if_exists<ov::genai::ProcessorConfig>(
             model_dir, "preprocessor_config.json"
         );
diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py
index 1e7a1b81a5..b13369b7ba 100644
--- a/tests/python_tests/test_sampling.py
+++ b/tests/python_tests/test_sampling.py
@@ -28,8 +28,8 @@
 @pytest.mark.precommit
 @pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit")))
 @pytest.mark.xfail(
-    raises=RuntimeError,
-    reason="Test fails with error: CPU: head size must be multiple of 16, current: X. CVS-145986.",
+    raises=(RuntimeError, AttributeError),
+    reason="RuntimeError with error: CPU: head size must be multiple of 16, current: X. CVS-145986. AttributeError: 'CodeGenAttention' object has no attribute 'causal_mask' for hf-tiny-model-private/tiny-random-CodeGenForCausalLM",
     strict=True,
 )
 def test_sampling_precommit(tmp_path, model_id):
diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_vlm_api.py
index 25b61dd0f9..bb5d421716 100644
--- a/tests/python_tests/test_vlm_api.py
+++ b/tests/python_tests/test_vlm_api.py
@@ -1,69 +1,33 @@
 # Copyright (C) 2018-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-import openvino_genai
+import openvino_tokenizers
+import openvino
 import pytest
-import gc
-import os
-import numpy as np
-from PIL import Image
-from multiprocessing import Process
-
+import transformers
+from optimum.intel.openvino import OVModelForVisualCausalLM
 from openvino_genai import VLMPipeline
-from openvino import Tensor
 from common import get_greedy, get_image_by_link, get_beam_search, get_greedy, get_multinomial_all_parameters
 
 def get_ov_model(model_dir):
-    import sys
-    from pathlib import Path
-    #TODO: use optimum-intel
-
-    sys.path.append(str(Path(__file__).resolve().parents[2] / 'samples/cpp/visual_language_chat'))
-    import importlib
-    export_MiniCPM = importlib.import_module("export_MiniCPM-V-2_6", "export_MiniCPM")
-    convert_llm = getattr(export_MiniCPM, "convert_llm")
-    convert_vision_encoder = getattr(export_MiniCPM, "convert_vision_encoder")
-    from transformers import AutoModel, AutoTokenizer, AutoProcessor
-    import os
-    import openvino_tokenizers
-    import openvino as ov
-    import gc
-
+    if (model_dir / "openvino_language_model.xml").exists():
+        return model_dir
     model_id = "openbmb/MiniCPM-V-2_6"
-    ckpt = Path(os.path.join(model_dir, "ckpt"))
-    if not ckpt.exists():
-        snapshot_download = getattr(export_MiniCPM, "snapshot_download")
-        patch_model_code = getattr(export_MiniCPM, "patch_model_code")
-        snapshot_download(model_id, local_dir=ckpt, force_download=True)
-        patch_model_code(ckpt)
-    model = AutoModel.from_pretrained(ckpt, trust_remote_code=True)
-    model.eval()
+    processor = transformers.AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+    processor.tokenizer.save_pretrained(model_dir)
+    ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True)
+    openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml")
+    openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml")
+    model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, trust_remote_code=True)
     model.config.save_pretrained(model_dir)
-    tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
-    tokenizer.save_pretrained(model_dir)
-    ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, with_detokenizer=True)
-    ov.save_model(ov_tokenizer, os.path.join(model_dir, "openvino_tokenizer.xml"))
-    ov.save_model(ov_detokenizer, os.path.join(model_dir, "openvino_detokenizer.xml"))
-    processor = AutoProcessor.from_pretrained(ckpt, trust_remote_code=True)
-    processor.save_pretrained(model_dir)
-
-    convert_llm(model, model_dir)
-    del model.llm
-    gc.collect()
-
-    convert_vision_encoder(model, model_dir)
+    model.generation_config.save_pretrained(model_dir)
+    model.save_pretrained(model_dir)
     return model_dir
 
-sampling_configs = [
-    get_beam_search(),
-    get_greedy(),
-    get_multinomial_all_parameters()
-]
 
 prompts = [
     "What is on the image?",
     "What is special about this image?",
-    "Tell me more about this image."
 ]
 
 image_links = [
@@ -75,39 +39,51 @@ def get_ov_model(model_dir):
 image_links_for_testing = [
     [],
     [image_links[0]],
-    [image_links[1], image_links[0]],
     [image_links[0], image_links[2], image_links[1]]
 ]
 
 @pytest.mark.precommit
-def test_vlm_pipeline(tmp_path):
-    import os
-
+@pytest.mark.nightly
+def test_vlm_pipeline(cache):
     def streamer(word: str) -> bool:
-        print(word, end="")
         return False
 
-    model_path = get_ov_model(os.path.join(tmp_path, "miniCPM"))
+    model_path = get_ov_model(cache.mkdir("MiniCPM-V-2_6"))
 
-    for generation_config in sampling_configs:
-        for links in image_links_for_testing:
-            images = []
-            for link in links:
-                images.append(get_image_by_link(link))
+    for links in image_links_for_testing:
+        images = []
+        for link in links:
+            images.append(get_image_by_link(link))
 
-            pipe = VLMPipeline(model_path, "CPU")
-            pipe.start_chat()
+        pipe = VLMPipeline(str(model_path), "CPU")
+        pipe.start_chat()
 
-            pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer)
+        pipe.generate(prompts[0], images=images, generation_config=get_greedy(), streamer=streamer)
 
-            for prompt in prompts[1:]:
-                pipe.generate(prompt, generation_config=generation_config, streamer=streamer)
+        for prompt in prompts[1:]:
+            pipe.generate(prompt, generation_config=get_greedy(), streamer=streamer)
 
-            pipe.finish_chat()
-            gc.collect()
+        pipe.finish_chat()
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_vlm_get_tokenizer(cache):
+    model_path = get_ov_model(cache.mkdir("MiniCPM-V-2_6"))
+    pipe = VLMPipeline(str(model_path), "CPU")
     tokenizer = pipe.get_tokenizer()
     tokenizer.encode("")
-    del pipe
-    gc.collect()
 
 
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize("config", [
+    get_beam_search(),
+    get_multinomial_all_parameters(),
+])
+@pytest.mark.skip("Enable after sampler are enabled")
+def test_sampling(config, cache):
+    model_path = get_ov_model(cache.mkdir("MiniCPM-V-2_6"))
+    image = get_image_by_link(image_links[0])
+    pipe = VLMPipeline(str(model_path), "CPU")
+    pipe.generate(prompts[0], image=image, generation_config=config)