better pos_embed in clip
tc-mb committed May 26, 2024
1 parent 2997a68 commit 8541e99
Showing 1 changed file with 23 additions and 10 deletions.
33 changes: 23 additions & 10 deletions examples/minicpmv/clip.cpp
@@ -593,7 +593,7 @@ std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(i
     return emb;
 }
 
-struct ggml_tensor * get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size, struct ggml_context * ctx, struct ggml_tensor * pos_embed) {
+std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
     int grid_h_size = image_size.first;
     int grid_w_size = image_size.second;
 
@@ -632,13 +632,7 @@ struct ggml_tensor * get_2d_sincos_pos_embed(int embed_dim, const std::pair<int,
         }
     }
 
-    float* dataArray = static_cast<float*>(pos_embed->data);
-    for(int i=0;i<grid_h_size * grid_w_size;++i){
-        for(int j=0;j<embed_dim;++j){
-            dataArray[i*embed_dim+j]=pos_embed_2d[i][j];
-        }
-    }
-    return pos_embed;
+    return pos_embed_2d;
 }
 
 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, std::pair<int, int> load_image_size = {448, 448}) {
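The change above drops the ggml_context and destination-tensor parameters: get_2d_sincos_pos_embed now just returns the table as host-side std::vector data instead of writing through pos_embed->data, which is only valid when the tensor's storage lives in host memory. For reference, a minimal sketch of the usual MAE-style 2D sincos computation this function implements; the helper names sincos_1d / sincos_2d are illustrative, not the ones in clip.cpp, and the exact ordering of the two halves may differ from the file:

#include <cmath>
#include <vector>

// 1D sincos embedding for one position: first half sin(pos * omega_k),
// second half cos(pos * omega_k), with omega_k = 1 / 10000^(2k/dim).
static std::vector<float> sincos_1d(int dim, float pos) {
    std::vector<float> emb(dim);
    for (int k = 0; k < dim / 2; ++k) {
        const double omega = 1.0 / std::pow(10000.0, 2.0 * k / dim);
        emb[k]           = (float) std::sin(pos * omega);
        emb[k + dim / 2] = (float) std::cos(pos * omega);
    }
    return emb;
}

// One embed_dim-sized row per (h, w) grid cell: the two halves encode the
// two grid axes.
static std::vector<std::vector<float>> sincos_2d(int embed_dim, int grid_h, int grid_w) {
    std::vector<std::vector<float>> out;
    out.reserve((size_t) grid_h * grid_w);
    for (int h = 0; h < grid_h; ++h) {
        for (int w = 0; w < grid_w; ++w) {
            std::vector<float> emb   = sincos_1d(embed_dim / 2, (float) h);
            std::vector<float> emb_w = sincos_1d(embed_dim / 2, (float) w);
            emb.insert(emb.end(), emb_w.begin(), emb_w.end());
            out.push_back(std::move(emb));
        }
    }
    return out;
}

With embed_dim = 4096 and, say, a 32 x 32 patch grid, the result is a 1024 x 4096 table (about 4.2M floats, 16 MiB as F32).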
@@ -708,8 +702,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
     int pos_w = image_size_width/patch_size;
     int pos_h = image_size_height/patch_size;
-    struct ggml_tensor * pos_embed = get_2d_sincos_pos_embed(4096, std::make_pair(pos_w, pos_h), ctx0, model.mm_model_pos_embed_k);
-    pos_embed = ggml_view_3d(ctx0, pos_embed, 4096, pos_w * pos_h, 1, pos_embed->nb[1], pos_embed->nb[2], 0);
+    struct ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+    ggml_set_name(pos_embed, "pos_embed");
+    ggml_set_input(pos_embed);
 
     // // pre-layernorm
     // {
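In the graph builder, pos_embed is no longer derived from the weight tensor mm_model_pos_embed_k at build time; it becomes a fresh F32 tensor that is named and marked with ggml_set_input, so the backend allocator treats its contents as supplied from outside the graph. This is the generic ggml named-input pattern, sketched below under the assumption of surrounding build/encode state (ctx0, gf, n, host_data); the tensor name "my_input" is illustrative:

// at graph-build time: create the tensor and mark it as a graph input
struct ggml_tensor * inp = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n);
ggml_set_name(inp, "my_input");
ggml_set_input(inp);

// at encode time, after the graph buffers have been allocated:
struct ggml_tensor * t = ggml_graph_get_tensor(gf, "my_input");
ggml_backend_tensor_set(t, host_data, 0, ggml_nbytes(t)); // host -> backend copy

The hunk below applies exactly this pattern to the positional embedding.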
@@ -2068,6 +2063,24 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         free(positions_data);
     }
 
+    {
+        struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
+        int pos_w = image_size_width/patch_size;
+        int pos_h = image_size_height/patch_size;
+        int embed_dim = 4096;
+        auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
+
+        float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
+        for(int i=0;i<pos_w * pos_h;++i){
+            for(int j=0;j<embed_dim;++j){
+                pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
+            }
+        }
+
+        ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
+        free(pos_embed_data);
+    }
+
     // {
     //     struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
     //     int* patches_data = (int*)malloc(ggml_nbytes(patches));
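This new block recomputes the sincos table on the host for the current grid size and copies it into the backend buffer with ggml_backend_tensor_set, presumably so the graph also works on non-CPU backends where writing through ->data would be invalid. An equivalent fill sketched with a flat std::vector (assuming <algorithm> and <vector> are included, and pos_w, pos_h, embed_dim, pos_embed_t, pos_embed as in the hunk above) would avoid the manual malloc/free:

std::vector<float> buf((size_t) pos_w * pos_h * embed_dim);
for (int i = 0; i < pos_w * pos_h; ++i) {
    // each pos_embed_t[i] is one embed_dim-sized row; copy it into the flat buffer
    std::copy(pos_embed_t[i].begin(), pos_embed_t[i].end(),
              buf.begin() + (size_t) i * embed_dim);
}
ggml_backend_tensor_set(pos_embed, buf.data(), 0, ggml_nbytes(pos_embed));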
