From 245020fdf79085ba2d52bb5cc5e6d85623a8fa4e Mon Sep 17 00:00:00 2001
From: zhangkaihuo
Date: Wed, 18 Sep 2024 17:05:34 +0800
Subject: [PATCH] correct model load time measurement

---
 examples/llava/minicpmv-cli.cpp | 29 ++++++++++++++++++-----------
 llama.cpp                       |  7 +++++--
 llama.h                         |  1 +
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 031f055329b35..f8aeba563f50c 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -75,6 +75,18 @@ static struct llava_context * llava_init_context(gpt_params * params) {
     } else {
         ctx_params.n_ctx = params->n_ctx;
     }
+
+    llama_model * model2 = nullptr;
+    if (params->skip_model.size() > 0 && params->skip_layers > 0) {
+        // load the secondary (skip-layer) model without re-initializing the ggml timer
+        llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+        model_params.init_time = false;
+        //llama_model * model2 = llama_load_model_from_file(params->model.c_str(), model_params);
+        //llama_model * model2 = llama_load_model_from_file("/Users/zkh/Downloads/last_16/ggml-model-Q4_0.gguf", model_params);
+        model2 = llama_load_model_from_file(params->skip_model.c_str(), model_params);
+        llama_set_model_skip_layers(model2, params->skip_layers);
+        //llama_add_model_load_times(model, model2);
+    }
 
     llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
 
@@ -83,6 +95,10 @@ static struct llava_context * llava_init_context(gpt_params * params) {
         return NULL;
     }
 
+    if (params->skip_model.size() > 0 && params->skip_layers > 0) {
+        llama_set_model2(ctx_llama, model2);
+    }
+
     for (unsigned int i = 0; i < params->lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params->lora_adapter[i]);
         float lora_scale = std::get<1>(params->lora_adapter[i]);
@@ -101,16 +117,6 @@ static struct llava_context * llava_init_context(gpt_params * params) {
 
     auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
 
-    {
-        //load last model
-        llama_model_params model_params = llama_model_params_from_gpt_params(*params);
-        //llama_model * model2 = llama_load_model_from_file(params->model.c_str(), model_params);
-        //llama_model * model2 = llama_load_model_from_file("/Users/zkh/Downloads/last_16/ggml-model-Q4_0.gguf", model_params);
-        llama_model * model2 = llama_load_model_from_file(params->skip_model.c_str(), model_params);
-        llama_set_model_skip_layers(model2, params->skip_layers);
-        llama_set_model2(ctx_llama, model2);
-    }
-
     ctx_llava->ctx_llama = ctx_llama;
     ctx_llava->ctx_clip = ctx_clip;
     ctx_llava->model = model;
@@ -341,7 +347,8 @@ int main(int argc, char ** argv) {
     if (params.image.size() > 0) {
         auto image = params.image;
         ctx_llava = minicpmv_init(&params, image, n_past);
-
+        // release ViT memory
+        clip_free(ctx_llava->ctx_clip);
         if (!params.prompt.empty()) {
             LOG_TEE("%s\n", params.prompt.c_str());
             LOG_TEE("");
diff --git a/llama.cpp b/llama.cpp
index d41e633bd0901..26c63fb4c2ffc 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -14849,6 +14849,7 @@ struct llama_model_params llama_model_default_params() {
         /*.vocab_only                =*/ false,
         /*.use_mmap                  =*/ true,
         /*.use_mlock                 =*/ false,
+        /*.init_time                 =*/ true,
     };
 
 #ifdef GGML_USE_METAL
@@ -14983,8 +14984,10 @@ void llama_set_model_skip_layers(
 struct llama_model * llama_load_model_from_file(
         const char * path_model,
         struct llama_model_params   params) {
-    ggml_time_init();
-
+    if (params.init_time) {
+        ggml_time_init();
+    }
+
     llama_model * model = new llama_model;
 
     unsigned cur_percentage = 0;
diff --git a/llama.h b/llama.h
index 91808dba55a70..4de1051b2b4d0 100644
--- a/llama.h
+++ b/llama.h
@@ -235,6 +235,7 @@ extern "C" {
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
+        bool init_time;  // call ggml_time_init() during model load (set to false for a secondary model)
     };
 
     struct llama_context_params {
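
Usage sketch (not part of the patch): the new `init_time` flag is meant to keep load-time reporting correct when a second, skip-layer model is loaded after the main one, so only the first load initializes the ggml timer. The model paths and the skip-layer count of 16 below are placeholders; `llama_set_model_skip_layers` and `llama_set_model2` come from earlier commits in this fork.

    #include "llama.h"

    int main() {
        // llama_backend_init(...) omitted here; call it as your llama.cpp version requires.

        // First (full) model: init_time defaults to true, so ggml_time_init() runs once
        // and the load time is measured against a freshly initialized timer.
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model-F16.gguf", mparams);        // placeholder path

        // Second (skip-layer) model: keep the existing timer state so the load time
        // already recorded for the first model is not reset (the point of this patch).
        llama_model_params mparams2 = llama_model_default_params();
        mparams2.init_time = false;
        llama_model * model2 = llama_load_model_from_file("model-last-16.gguf", mparams2);  // placeholder path
        llama_set_model_skip_layers(model2, 16);                                            // placeholder layer count

        // ... create a context with llama_new_context_with_model(model, ...) and attach
        //     the secondary model with llama_set_model2(ctx, model2), as minicpmv-cli.cpp does ...

        llama_free_model(model2);
        llama_free_model(model);
        return 0;
    }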