diff --git a/Makefile b/Makefile
index d5288dc41f75..e4d5b22ce1db 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=feff4aa8461da7c432d144c11da4802e41fef3cf
+CPPLLAMA_VERSION?=6262d13e0b2da91f230129a93a996609a2f5a2f2

 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index a46b4ee0a335..56d59d217a7c 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -13,6 +13,7 @@
 #include 
 #include "clip.h"
 #include "llava.h"
+#include "log.h"
 #include "stb_image.h"
 #include "common.h"
 #include "json.hpp"
@@ -448,7 +449,7 @@ struct llama_server_context
             LOG_INFO("Multi Modal Mode Enabled", {});
             clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
             if(clp_ctx == nullptr) {
-                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
+                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
                 return false;
             }

@@ -462,7 +463,7 @@ struct llama_server_context
         ctx = llama_init.context;
         if (model == nullptr)
         {
-            LOG_ERROR("unable to load model", {{"model", params.model}});
+            LOG_ERR("unable to load model: %s", params.model.c_str());
             return false;
         }

@@ -470,7 +471,7 @@ struct llama_server_context
             const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
             const int n_embd_llm  = llama_n_embd(model);
             if (n_embd_clip != n_embd_llm) {
-                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                 llama_free(ctx);
                 llama_free_model(model);
                 return false;
@@ -489,7 +490,7 @@ struct llama_server_context
            std::vector<char> buf(1);
            int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
            if (res < 0) {
-               LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+               LOG_ERR("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
               sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
            }
        }
@@ -812,10 +813,11 @@ struct llama_server_context
            img_sl.img_data = clip_image_u8_init();
            if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
            {
-               LOG_ERROR("failed to load image", {
-                   {"slot_id",   slot->id},
-                   {"img_sl_id", img_sl.id}
-               });
+               LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
+                   __func__,
+                   slot->id,
+                   img_sl.id
+               );
                return false;
            }
            LOG_VERBOSE("image loaded", {
@@ -853,12 +855,12 @@ struct llama_server_context
                        }
                    }
                    if (!found) {
-                       LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
+                       LOG("ERROR: Image with id: %i, not found.\n", img_id);
                        slot->images.clear();
                        return false;
                    }
                } catch (const std::invalid_argument& e) {
-                   LOG_TEE("Invalid image number id in prompt\n");
+                   LOG("Invalid image number id in prompt\n");
                    slot->images.clear();
                    return false;
                }
@@ -886,7 +888,7 @@ struct llama_server_context
                {"task_id", slot->task_id},
            });

-           // LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+           // LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());

            return true;
        }
@@ -926,7 +928,7 @@ struct llama_server_context
                };

                if (llama_decode(ctx, batch_view) != 0)
                {
-                   LOG_TEE("%s: llama_decode() failed\n", __func__);
+                   LOG("%s: llama_decode() failed\n", __func__);
                    return;
                }
            }
@@ -938,7 +940,7 @@ struct llama_server_context
            }
        }

-       LOG_TEE("system prompt updated\n");
+       LOG("system prompt updated\n");
        system_need_update = false;
    }
@@ -1120,7 +1122,7 @@ struct llama_server_context
            }

            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-               LOG_TEE("Error processing the given image");
+               LOG("Error processing the given image");
                return false;
            }
@@ -1132,7 +1134,7 @@ struct llama_server_context
    void send_error(task_server& task, const std::string &error)
    {
-       LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
+       LOG("task %i - error: %s\n", task.id, error.c_str());
        task_result res;
        res.id = task.id;
        res.multitask_id = task.multitask_id;
@@ -1371,7 +1373,7 @@ struct llama_server_context
                };

                if (llama_decode(ctx, batch_view))
                {
-                   LOG_TEE("%s : failed to eval\n", __func__);
+                   LOG("%s : failed to eval\n", __func__);
                    return false;
                }
            }
@@ -1389,7 +1391,7 @@ struct llama_server_context
                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
                if (llama_decode(ctx, batch_img))
                {
-                   LOG_TEE("%s : failed to eval image\n", __func__);
+                   LOG("%s : failed to eval image\n", __func__);
                    return false;
                }
                slot.n_past += n_eval;
@@ -1572,7 +1574,7 @@ struct llama_server_context
                slot.n_past = 0;
                slot.truncated = false;
                slot.has_next_token = true;
-               LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+               LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
                continue;
                // END LOCALAI changes
@@ -1820,10 +1822,11 @@ struct llama_server_context

                    if (has_images && !ingest_images(slot, n_batch))
                    {
-                       LOG_ERROR("failed processing images", {
-                           "slot_id", slot.id,
-                           "task_id", slot.task_id,
-                       });
+                       LOG_ERR("%s: failed processing images, slot_id: %d, task_id: %d",
+                           __func__,
+                           slot.id,
+                           slot.task_id
+                       );
                        // FIXME @phymbert: to be properly tested
                        // early returning without changing the slot state will block the slot for ever
                        // no one at the moment is checking the return value
@@ -1863,10 +1866,10 @@ struct llama_server_context
                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;

-                       LOG_TEE("\n");
-                       LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                       LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                       LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+                       LOG("\n");
+                       LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+                       LOG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+                       LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);

                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
@@ -1876,7 +1879,7 @@ struct llama_server_context
                        slot.ga_i += slot.ga_w / slot.ga_n;

-                       LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+                       LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
                    }
                    slot.n_past_se += n_tokens;
                }
@@ -1901,11 +1904,11 @@ struct llama_server_context
                if (n_batch == 1 || ret < 0)
                {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                   LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                   LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                    return false;
                }

-               LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
+               LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);

                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
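
The logging change above is mechanical: the llama.cpp revision pinned by `CPPLLAMA_VERSION` drops the server example's `LOG_TEE` and structured `LOG_ERROR` helpers in favor of the printf-style `LOG`/`LOG_ERR` macros from `log.h` (hence the new include). Below is a minimal sketch of the call-shape change; the macro definitions are stand-ins for illustration only, since the real macros live in llama.cpp's `log.h` and route through its logger rather than bare `fprintf`:

```cpp
// Stand-in definitions that mimic only the printf-style calling convention
// the diff migrates to; not the real log.h implementation.
#include <cstdio>

#define LOG(...)     std::fprintf(stdout, __VA_ARGS__)  // replaces LOG_TEE(fmt, ...)
#define LOG_ERR(...) std::fprintf(stderr, __VA_ARGS__)  // replaces LOG_ERROR(msg, {json})

int main() {
    const char *mmproj = "model.mmproj";  // hypothetical file name
    int slot_id = 0, img_id = 3;

    // Old style carried structured fields as a JSON-like initializer list:
    //   LOG_ERROR("failed to load image", {{"slot_id", slot_id}, {"img_sl_id", img_id}});
    // New style inlines the fields into a plain printf format string:
    LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d\n",
            __func__, slot_id, img_id);

    // LOG_TEE(...) becomes LOG(...) with identical printf semantics.
    LOG("unable to load clip model: %s\n", mmproj);
    return 0;
}
```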
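The last hunk's control flow is also worth spelling out: when decoding fails because the KV cache has no free slot, the server halves `n_batch` and retries until `llama_decode` succeeds or the batch cannot shrink further. A self-contained sketch of that loop, with a hypothetical `try_decode` standing in for `llama_decode` on a batch view:

```cpp
#include <cstdio>

// Hypothetical stand-in for llama_decode: returns 0 on success, >0 when the
// KV cache has no free slot for a batch of this size, <0 on a hard error.
int try_decode(int n_batch) { return n_batch > 8 ? 1 : 0; }

int main() {
    int n_batch = 32;
    for (;;) {
        int ret = try_decode(n_batch);
        if (ret == 0) break;             // decoded successfully
        if (n_batch == 1 || ret < 0) {   // cannot shrink further, or hard failure
            std::fprintf(stderr, "failed to decode the batch, n_batch = %d, ret = %d\n",
                         n_batch, ret);
            return 1;
        }
        // retry with half the batch size to try to find a free slot in the KV cache
        std::fprintf(stderr, "retrying with smaller n_batch = %d\n", n_batch / 2);
        n_batch /= 2;
    }
    std::fprintf(stdout, "decoded with n_batch = %d\n", n_batch);
    return 0;
}
```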