Skip to content

Commit

Permalink
clean code and fix bug in clip
Browse files Browse the repository at this point in the history
  • Loading branch information
tc-mb committed Sep 27, 2024
1 parent c900ada commit cdc9511
Show file tree
Hide file tree
Showing 8 changed files with 852 additions and 29 deletions.
45 changes: 45 additions & 0 deletions examples/llava/README-minicpmv-dev.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
## MiniCPM-V dev

### Prepare models and code

Clone llama.cpp:
```bash
git clone git@github.com:OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpmv-main-dev
```

### Usage of MiniCPM-V 2.6

Convert the PyTorch model to GGUF files (you can also download the [GGUF files](https://huggingface.co/openbmb/MiniCPM-V-dev-gguf) that we have already converted):

```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-dev
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-dev --minicpmv-projector ../MiniCPM-V-dev/minicpmv.projector --output-dir ../MiniCPM-V-dev/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
```

Add `res = "llama-bpe"` at line 514 of `convert_hf_to_gguf.py`, then run:
```bash
python ./convert_hf_to_gguf.py ../MiniCPM-V-dev/model

# quantize int4 version
./llama-quantize ../MiniCPM-V-dev/model/ggml-model-f16.gguf ../MiniCPM-V-dev/model/ggml-model-Q4_K_M.gguf Q4_K_M
```

Build for Linux or Mac

```bash
make
```

Inference on Linux or Mac
```bash
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-V-dev/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-dev/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-V-dev/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-dev/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-V-dev/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-dev/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```
21 changes: 6 additions & 15 deletions examples/llava/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -503,8 +503,6 @@ struct clip_vision_model {
struct ggml_tensor * mm_model_peg_0_b;

// MINICPMV projection
struct ggml_tensor * mm_model_pos_embed;
struct ggml_tensor * mm_model_pos_embed_k;
struct ggml_tensor * mm_model_query;
struct ggml_tensor * mm_model_proj;
struct ggml_tensor * mm_model_kv_proj;
Expand Down Expand Up @@ -960,9 +958,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}
struct ggml_tensor * k;
{ // position
if (ctx->minicpmv_version == 1) {
q = ggml_add(ctx0, q, model.mm_model_pos_embed);
}
k = ggml_add(ctx0, v, pos_embed);
}

Expand Down Expand Up @@ -1285,7 +1280,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));


if (new_clip->has_minicpmv_projector) {
hparams.n_layer = 27;
}

try {
int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
int n = gguf_get_arr_n(ctx, idx);
Expand Down Expand Up @@ -1447,10 +1446,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
}
else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
if (new_clip->minicpmv_version == 1) {
vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
}
vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
Expand Down Expand Up @@ -1632,7 +1627,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
}
}

inline float clip(float x, float lower, float upper) {
inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}

Expand Down Expand Up @@ -1836,10 +1831,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
return refine_size;
}

inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}

static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
std::vector<int> candidate_split_grids_nums;
for (int i : {multiple - 1, multiple, multiple + 1}) {
Expand Down
11 changes: 2 additions & 9 deletions examples/llava/llava.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
return true;
}

static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
int width = image->nx;
int height = image->ny;
int num_patches = (height / patch_size) * (width / patch_size);
Expand Down Expand Up @@ -256,14 +256,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
bool encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
Expand Down
6 changes: 2 additions & 4 deletions examples/llava/minicpmv-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,18 +185,16 @@ static int process_image(struct llava_context * ctx_llava, struct llava_image_em
res = eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (num_image_embeds > 1) {
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
for (size_t j = 0; j < num_image_embeds_col; ++j) {
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
if (j == num_image_embeds_col - 1) {
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
}
}
}
res = eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
res = eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
Expand Down
Loading

0 comments on commit cdc9511

Please sign in to comment.