clean code and fix bug in clip

OpenBMB · Sep 27, 2024 · cdc9511 · cdc9511
1 parent c900ada
commit cdc9511
Show file tree

Hide file tree

Showing 8 changed files with 852 additions and 29 deletions.
diff --git a/examples/llava/README-minicpmv-dev.md b/examples/llava/README-minicpmv-dev.md
@@ -0,0 +1,45 @@
+## MiniCPM-V dev
+
+### Prepare models and code
+
+Clone llama.cpp:
+```bash
+git clone [email protected]:OpenBMB/llama.cpp.git
+cd llama.cpp
+git checkout minicpmv-main-dev
+```
+
+### Usage of MiniCPM-V 2.6
+
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-dev-gguf) by us)
+
+```bash
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-dev
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-dev --minicpmv-projector ../MiniCPM-V-dev/minicpmv.projector --output-dir ../MiniCPM-V-dev/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
+```
+
+add 'res = "llama-bpe"' in convert_hf_to_gguf.py 514 line
+```bash
+python ./convert_hf_to_gguf.py ../MiniCPM-V-dev/model
+
+# quantize int4 version
+./llama-quantize ../MiniCPM-V-dev/model/ggml-model-f16.gguf ../MiniCPM-V-dev/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+Build for Linux or Mac
+
+```bash
+make
+```
+
+Inference on Linux or Mac
+```
+# run f16 version
+./llama-minicpmv-cli -m ../MiniCPM-V-dev/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-dev/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run quantized int4 version
+./llama-minicpmv-cli -m ../MiniCPM-V-dev/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-dev/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg  -p "What is in the image?"
+
+# or run in interactive mode
+./llama-minicpmv-cli -m ../MiniCPM-V-dev/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-dev/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
@@ -503,8 +503,6 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_peg_0_b;
 
     // MINICPMV projection
-    struct ggml_tensor * mm_model_pos_embed;
-    struct ggml_tensor * mm_model_pos_embed_k;
     struct ggml_tensor * mm_model_query;
     struct ggml_tensor * mm_model_proj;
     struct ggml_tensor * mm_model_kv_proj;
@@ -960,9 +958,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             }
             struct ggml_tensor * k;
             { // position
-                if (ctx->minicpmv_version == 1) {
-                    q = ggml_add(ctx0, q, model.mm_model_pos_embed);
-                }
                 k = ggml_add(ctx0, v, pos_embed);      
             }
 
@@ -1285,7 +1280,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         hparams.patch_size     = get_u32(ctx, KEY_PATCH_SIZE);
         hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
         hparams.eps            = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
-
+
+        if (new_clip->has_minicpmv_projector) {
+            hparams.n_layer = 27;
+        }
+
         try {
             int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
             int n = gguf_get_arr_n(ctx, idx);
@@ -1447,10 +1446,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
         }
         else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-            if (new_clip->minicpmv_version == 1) {
-                vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
-            }
-            vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
             vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
             vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
             vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
@@ -1632,7 +1627,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
     }
 }
 
-inline float clip(float x, float lower, float upper) {
+inline int clip(int x, int lower, int upper) {
     return std::max(lower, std::min(x, upper));
 }
 
@@ -1836,10 +1831,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
     return refine_size;
 }
 
-inline int clip(int x, int lower, int upper) {
-    return std::max(lower, std::min(x, upper));
-}
-
 static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
     std::vector<int> candidate_split_grids_nums;
     for (int i : {multiple - 1, multiple, multiple + 1}) {

diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
@@ -202,7 +202,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     return true;
 }
 
-static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
+static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
     int width = image->nx;
     int height = image->ny;
     int num_patches = (height / patch_size) * (width / patch_size);
@@ -256,14 +256,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             load_image_size->width = img_res_v.data[i].nx;
             load_image_size->height = img_res_v.data[i].ny;
             clip_add_load_image_size(ctx_clip, load_image_size);
-            bool encoded = false;
-            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
-            if (has_minicpmv_projector == 2) {
-                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
-            }
-            else {
-                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
-            }
+            bool encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);     
             if (!encoded) {
                 LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                 return false;

diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
@@ -185,18 +185,16 @@ static int process_image(struct llava_context * ctx_llava, struct llava_image_em
     res = eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
     if (num_image_embeds > 1) {
         size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
-        eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
         for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
             for (size_t j = 0; j < num_image_embeds_col; ++j) {
-                eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
+                eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
                 process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
-                eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+                eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
                 if (j == num_image_embeds_col - 1) {
                     eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
                 }
             }
         }
-        res = eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
     }
     res = eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
     LOG_TEE("%s: image token past: %d\n", __func__, n_past);