diff --git a/common/train.cpp b/common/train.cpp index 773e2c59cc669..dcf9614e40823 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -71,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd) struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier - switch (tensor->n_dims) { + switch (ggml_n_dims(tensor)) { case 1: scale /= sqrtf((float) tensor->ne[0]); for (int i0 = 0; i0 < tensor->ne[0]; i0++) { @@ -119,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct } struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) { - switch (tensor->n_dims) { + switch (ggml_n_dims(tensor)) { case 1: for (int i0 = 0; i0 < tensor->ne[0]; i0++) { float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]); @@ -183,25 +183,27 @@ float fclamp(const float v, const float min, const float max) { } void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { - GGML_ASSERT(tensor->n_dims == 1); GGML_ASSERT(tensor->ne[0] == ne0); + GGML_ASSERT(tensor->ne[1] == 1); + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); } void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) { - GGML_ASSERT(tensor->n_dims == 2); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); + GGML_ASSERT(tensor->ne[2] == 1); + GGML_ASSERT(tensor->ne[3] == 1); } void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) { - GGML_ASSERT(tensor->n_dims == 3); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[2] == ne2); + GGML_ASSERT(tensor->ne[3] == 1); } void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) { - GGML_ASSERT(tensor->n_dims == 4); GGML_ASSERT(tensor->ne[0] == ne0); GGML_ASSERT(tensor->ne[1] == ne1); GGML_ASSERT(tensor->ne[2] == ne2); @@ -225,8 +227,8 @@ int64_t get_example_targets_batch( bool sample_random_offsets ) { GGML_ASSERT(samples_count > 0); - GGML_ASSERT(tokens_input->n_dims == 2); - GGML_ASSERT(target_probs->n_dims == 3); + GGML_ASSERT(ggml_is_matrix(tokens_input)); + GGML_ASSERT(ggml_is_3d(target_probs)); int64_t n_vocab = target_probs->ne[0]; int64_t n_tokens = tokens_input->ne[0]; int64_t n_batch = tokens_input->ne[1]; diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index a937410dd8a9f..53bb8a3d97a05 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -3,7 +3,6 @@ import json import os -import re import struct import sys from typing import Any, BinaryIO, Sequence @@ -11,41 +10,13 @@ import numpy as np import torch -NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} - +from pathlib import Path +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf -HF_SUBLAYER_TO_GGML = { - "self_attn.q_proj": "attn_q", - "self_attn.k_proj": "attn_k", - "self_attn.v_proj": "attn_v", - "self_attn.o_proj": "attn_output", - "mlp.gate_proj": "ffn_gate", - "mlp.down_proj": "ffn_down", - "mlp.up_proj": "ffn_up", - "input_layernorm": "attn_norm", - "post_attention_layernorm": "ffn_norm", -} - - -def translate_tensor_name(t: str) -> str: - match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t) - if match: - nn = match.group(1) - sub_layer = match.group(2) - lora_type = match.group(3) - - 
sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer) - if sub_layer_renamed is None: - print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}") - sys.exit(1) - output_string = ( - f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}" - ) - return output_string - else: - print(f"Error: unrecognized tensor {t}") - sys.exit(1) +NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None: @@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None: fout.write(struct.pack("i", int(params["lora_alpha"]))) -def write_tensor_header( - self, name: str, shape: Sequence[int], data_type: np.dtype[Any] -) -> None: +def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None: sname = name.encode("utf-8") fout.write( struct.pack( @@ -78,11 +47,12 @@ def write_tensor_header( fout.seek((fout.tell() + 31) & -32) -if len(sys.argv) != 2: - print(f"Usage: python {sys.argv[0]} ") +if len(sys.argv) < 2: + print(f"Usage: python {sys.argv[0]} [arch]") print( "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" ) + print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") sys.exit(1) input_json = os.path.join(sys.argv[1], "adapter_config.json") @@ -90,6 +60,14 @@ def write_tensor_header( output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin") model = torch.load(input_model, map_location="cpu") +arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" + +if arch_name not in gguf.MODEL_ARCH_NAMES.values(): + print(f"Error: unsupported architecture {arch_name}") + sys.exit(1) + +arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)] +name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone with open(input_json, "r") as f: params = json.load(f) @@ -117,6 +95,7 @@ def write_tensor_header( write_file_header(fout, params) for k, v in model.items(): + orig_k = k if k.endswith(".default.weight"): k = k.replace(".default.weight", ".weight") if k in ["llama_proj.weight", "llama_proj.bias"]: @@ -129,7 +108,32 @@ def write_tensor_header( v = v.float() t = v.detach().numpy() - tname = translate_tensor_name(k) + + prefix = "base_model.model." 
+ if k.startswith(prefix): + k = k[len(prefix) :] + + lora_suffixes = (".lora_A.weight", ".lora_B.weight") + if k.endswith(lora_suffixes): + suffix = k[-len(lora_suffixes[0]):] + k = k[: -len(lora_suffixes[0])] + else: + print(f"Error: unrecognized tensor name {orig_k}") + sys.exit(1) + + tname = name_map.get_name(k) + if tname is None: + print(f"Error: could not map tensor name {orig_k}") + print(" Note: the arch parameter must be specified if the model is not llama") + sys.exit(1) + + if suffix == ".lora_A.weight": + tname += ".weight.loraA" + elif suffix == ".lora_B.weight": + tname += ".weight.loraB" + else: + assert False + print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") write_tensor_header(fout, tname, t.shape, t.dtype) t.tofile(fout) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 8155101d0ab93..2dc2988d34c81 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1258,9 +1258,9 @@ static struct ggml_tensor * forward_lora( } static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) { - assert(logits->n_dims == 2); - assert(probs->n_dims == 2); - assert(best_samples->n_dims == 1); + assert(ggml_is_matrix(logits)); + assert(ggml_is_matrix(probs)); + assert(ggml_is_vector(best_samples)); assert(logits->ne[1] == best_samples->ne[0]); assert(logits->ne[0] == probs->ne[0]); assert(logits->ne[1] == probs->ne[1]); @@ -1292,9 +1292,9 @@ static void sample_softmax_batch( struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples ) { - GGML_ASSERT(best_samples->n_dims == 2); - GGML_ASSERT(logits->n_dims == 3); - GGML_ASSERT(probs->n_dims == 3); + GGML_ASSERT(ggml_is_matrix(best_samples)); + GGML_ASSERT(ggml_is_3d(logits)); + GGML_ASSERT(ggml_is_3d(probs)); int n_tokens = best_samples->ne[0]; int n_batch = best_samples->ne[1]; int n_vocab = logits->ne[0]; @@ -1334,7 +1334,7 @@ static void print_row(struct ggml_tensor * probs, int i) { } static void print_matrix(struct ggml_tensor * probs) { - assert(probs->n_dims == 2); + assert(ggml_is_matrix(probs)); for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k); @@ -1386,8 +1386,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu static void get_example_targets_batch( struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets ) { - GGML_ASSERT(tokens_input->n_dims == 2); - GGML_ASSERT( targets->n_dims == 3); + GGML_ASSERT(ggml_is_matrix(tokens_input)); + GGML_ASSERT(ggml_is_3d(targets)); int n_tokens = tokens_input->ne[0]; int n_batch = tokens_input->ne[1]; GGML_ASSERT(n_tokens == targets->ne[1]); diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index c8cd910ed1938..512763a160267 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -130,13 +130,13 @@ int main(int argc, char ** argv) { const ggml_type qtype = GGML_TYPE_Q4_1; size_t ctx_size = 0; - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32); - ctx_size += sizex*sizey*ggml_type_sizef(qtype); - ctx_size += sizex*sizey*ggml_type_sizef(qtype); - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // 
BLAS - ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez); + ctx_size += ggml_row_size(qtype, sizex*sizey); + ctx_size += ggml_row_size(qtype, sizex*sizey); + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS + ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS ctx_size += 1024*1024*16; printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024)); diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index cae3bf3c3dc65..4d41e17793f43 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) { } static void print_matrix(struct ggml_tensor * probs) { - assert(probs->n_dims == 2); + assert(ggml_is_matrix(probs)); for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = get_f32_2d(probs, k, i); @@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { int ct; - switch (gg_weights->n_dims){ + switch (ggml_n_dims(gg_weights)) { case 1: ct = 0; for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){ diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index af46e44a6e216..b9849e8c910a0 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1110,7 +1110,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor, name = ggml_get_name(tensor); } uint32_t name_len = strlen(name); - uint32_t nd = tensor->n_dims; + uint32_t nd = ggml_n_dims(tensor); uint32_t ne[4] = { (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 9ab63a29310ad..9e24bf24c75e1 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -195,7 +195,7 @@ static bool gguf_ex_read_1(const std::string & fname) { struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name); - printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data); + printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data); // print first 10 elements const float * data = (const float *) cur->data; diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 4bb7b93b63440..1124659685034 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -514,7 +514,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { ctx_size += padded_size; if (verbosity >= 3) { printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i, - cur->n_dims, cur->name, tensor_size, padded_size, offset); + ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset); } } } @@ -962,7 +962,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i } // quantize only 2D tensors - quantize &= (cur->n_dims == 2); + quantize &= (ggml_n_dims(cur) == 2); if (quantize) { new_type = type; @@ -1035,7 +1035,7 @@ bool clip_model_quantize(const char * fname_inp, const 
char * fname_out, const i fout.put(0); } - printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize, + printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); } diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index c281f0fbd5535..6e2b99565dc6e 100644 --- a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -34,7 +34,8 @@ export async function* llama(prompt, params = {}, config = {}) { headers: { 'Connection': 'keep-alive', 'Content-Type': 'application/json', - 'Accept': 'text/event-stream' + 'Accept': 'text/event-stream', + ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {}) }, signal: controller.signal, }); diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 451fd4a3be602..07d779d2008a2 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -235,7 +235,8 @@ grammar: '', n_probs: 0, // no completion_probabilities, image_data: [], - cache_prompt: true + cache_prompt: true, + api_key: '' }) /* START: Support for storing prompt templates and parameters in browsers LocalStorage */ @@ -790,6 +791,10 @@
${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
+          <fieldset>
+            <label for="api_key">API Key</label>
+            <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
+          </fieldset>
` diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d6ff3771f12ab..404549352ef46 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -37,6 +37,7 @@ using json = nlohmann::json; struct server_params { std::string hostname = "127.0.0.1"; + std::string api_key; std::string public_path = "examples/server/public"; int32_t port = 8080; int32_t read_timeout = 600; @@ -1954,6 +1955,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); + printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n"); printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); @@ -2003,6 +2005,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } sparams.public_path = argv[i]; } + else if (arg == "--api-key") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.api_key = argv[i]; + } else if (arg == "--timeout" || arg == "-to") { if (++i >= argc) @@ -2670,6 +2681,32 @@ int main(int argc, char **argv) httplib::Server svr; + // Middleware for API key validation + auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool { + // If API key is not set, skip validation + if (sparams.api_key.empty()) { + return true; + } + + // Check for API key in the header + auto auth_header = req.get_header_value("Authorization"); + std::string prefix = "Bearer "; + if (auth_header.substr(0, prefix.size()) == prefix) { + std::string received_api_key = auth_header.substr(prefix.size()); + if (received_api_key == sparams.api_key) { + return true; // API key is valid + } + } + + // API key is invalid or not provided + res.set_content("Unauthorized: Invalid API Key", "text/plain"); + res.status = 401; // Unauthorized + + LOG_WARNING("Unauthorized: Invalid API Key", {}); + + return false; + }; + svr.set_default_headers({{"Server", "llama.cpp"}, {"Access-Control-Allow-Origin", "*"}, {"Access-Control-Allow-Headers", "content-type"}}); @@ -2712,8 +2749,11 @@ int main(int argc, char **argv) res.set_content(data.dump(), "application/json"); }); - svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res) + svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { + if (!validate_api_key(req, res)) { + return; + } json data = json::parse(req.body); const int task_id = llama.request_completion(data, false, false, -1); if (!json_value(data, "stream", false)) { @@ -2800,8 +2840,11 @@ int main(int argc, char **argv) }); // TODO: add mount point without "/v1" prefix -- how? 
- svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res) + svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { + if (!validate_api_key(req, res)) { + return; + } json data = oaicompat_completion_params_parse(json::parse(req.body)); const int task_id = llama.request_completion(data, false, false, -1); @@ -2870,8 +2913,11 @@ int main(int argc, char **argv) } }); - svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res) + svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { + if (!validate_api_key(req, res)) { + return; + } json data = json::parse(req.body); const int task_id = llama.request_completion(data, true, false, -1); if (!json_value(data, "stream", false)) { @@ -3006,11 +3052,15 @@ int main(int argc, char **argv) svr.set_error_handler([](const httplib::Request &, httplib::Response &res) { + if (res.status == 401) + { + res.set_content("Unauthorized", "text/plain"); + } if (res.status == 400) { res.set_content("Invalid request", "text/plain"); } - else if (res.status != 500) + else if (res.status == 404) { res.set_content("File Not Found", "text/plain"); res.status = 404; @@ -3033,11 +3083,15 @@ int main(int argc, char **argv) // to make it ctrl+clickable: LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); - LOG_INFO("HTTP server listening", { - {"hostname", sparams.hostname}, - {"port", sparams.port}, - }); + std::unordered_map log_data; + log_data["hostname"] = sparams.hostname; + log_data["port"] = std::to_string(sparams.port); + + if (!sparams.api_key.empty()) { + log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4); + } + LOG_INFO("HTTP server listening", log_data); // run the HTTP server in a thread - see comment below std::thread t([&]() { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 146659c9f212a..7268e24ad4bed 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -8890,6 +8890,12 @@ static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, gg (void) dst; } +static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); +} + void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { const int64_t nrows = ggml_nrows(tensor); @@ -8939,8 +8945,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses if (ne0 % MATRIX_ROW_PADDING != 0) { - size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) - * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } char * buf; @@ -9481,8 +9486,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t if (ggml_is_quantized(tensor->type)) { if (ne0 % MATRIX_ROW_PADDING != 0) { - size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) - * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } } diff --git a/ggml.c b/ggml.c index ee207a5d19454..a380114faf6ff 100644 --- a/ggml.c +++ b/ggml.c @@ -1997,12 +1997,6 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * 
tensor) { return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); } -size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type); -} - int ggml_blck_size(enum ggml_type type) { return type_traits[type].blck_size; } @@ -2011,6 +2005,11 @@ size_t ggml_type_size(enum ggml_type type) { return type_traits[type].type_size; } +size_t ggml_row_size(enum ggml_type type, int64_t ne) { + assert(ne % ggml_blck_size(type) == 0); + return ggml_type_size(type)*ne/ggml_blck_size(type); +} + double ggml_type_sizef(enum ggml_type type) { return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; } @@ -2049,24 +2048,37 @@ size_t ggml_element_size(const struct ggml_tensor * tensor) { return ggml_type_size(tensor->type); } -static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) { +bool ggml_is_scalar(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; } -static inline bool ggml_is_vector(const struct ggml_tensor * tensor) { +bool ggml_is_vector(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; } -static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) { +bool ggml_is_matrix(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[2] == 1 && tensor->ne[3] == 1; } +bool ggml_is_3d(const struct ggml_tensor * tensor) { + return tensor->ne[3] == 1; +} + +int ggml_n_dims(const struct ggml_tensor * tensor) { + for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) { + if (tensor->ne[i] > 1) { + return i + 1; + } + } + return 1; +} + static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -2473,7 +2485,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( view_src = view_src->view_src; } - size_t data_size = ggml_type_size(type)*(ne[0]/ggml_blck_size(type)); + size_t data_size = ggml_row_size(type, ne[0]); for (int i = 1; i < n_dims; i++) { data_size *= ne[i]; } @@ -2516,7 +2528,6 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.type =*/ type, /*.backend =*/ GGML_BACKEND_CPU, /*.buffer =*/ NULL, - /*.n_dims =*/ n_dims, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ GGML_OP_NONE, @@ -2623,7 +2634,7 @@ struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { } struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { - return ggml_new_tensor(ctx, src->type, src->n_dims, src->ne); + return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne); } static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { @@ -3072,7 +3083,7 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, struct ggml_tensor * src) { - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src, 0); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, 
src->type, GGML_MAX_DIMS, src->ne, src, 0); ggml_format_name(result, "%s (view)", src->name); for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -3230,10 +3241,10 @@ static struct ggml_tensor * ggml_add_cast_impl( is_node = true; } - struct ggml_tensor * result = ggml_new_tensor(ctx, type, a->n_dims, a->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne); result->op = GGML_OP_ADD; - result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne) : NULL; + result->grad = is_node ? ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne) : NULL; result->src[0] = a; result->src[1] = b; @@ -3602,12 +3613,12 @@ struct ggml_tensor * ggml_sum_rows( is_node = true; } - int64_t ne[4] = {1,1,1,1}; - for (int i=1; in_dims; ++i) { + int64_t ne[GGML_MAX_DIMS] = { 1 }; + for (int i = 1; i < GGML_MAX_DIMS; ++i) { ne[i] = a->ne[i]; } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, a->n_dims, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne); result->op = GGML_OP_SUM_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -3628,8 +3639,8 @@ struct ggml_tensor * ggml_mean( is_node = true; } - int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne); + int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_MEAN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -3651,8 +3662,7 @@ struct ggml_tensor * ggml_argmax( is_node = true; } - int64_t ne[GGML_MAX_DIMS] = { a->ne[1], 1, 1, 1 }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, ne); + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]); result->op = GGML_OP_ARGMAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -3675,7 +3685,7 @@ struct ggml_tensor * ggml_repeat( is_node = true; } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne); result->op = GGML_OP_REPEAT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -3702,7 +3712,7 @@ struct ggml_tensor * ggml_repeat_back( return a; } - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne); result->op = GGML_OP_REPEAT_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -4078,7 +4088,7 @@ struct ggml_tensor * ggml_mul_mat( } const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_MUL_MAT; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -4112,7 +4122,7 @@ struct ggml_tensor * ggml_mul_mat_id( } const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); ggml_set_op_params_i32(result, 0, id); ggml_set_op_params_i32(result, 1, n_as); @@ -4150,7 +4160,7 @@ struct ggml_tensor * ggml_out_prod( // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3] const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); result->op = GGML_OP_OUT_PROD; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -4435,7 +4445,7 @@ struct ggml_tensor * ggml_reshape( //GGML_ASSERT(false); } - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a, 0); + struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0); ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; @@ -4813,7 +4823,7 @@ struct ggml_tensor * ggml_diag( } const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, MAX(a->n_dims, 2), ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne); result->op = GGML_OP_DIAG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5460,7 +5470,7 @@ struct ggml_tensor * ggml_pool_1d( is_node = true; } - const int64_t ne[3] = { + const int64_t ne[2] = { ggml_calc_pool_output_size(a->ne[0], k0, s0, p0), a->ne[1], }; @@ -5579,7 +5589,7 @@ struct ggml_tensor * ggml_argsort( enum ggml_sort_order order) { bool is_node = false; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne); ggml_set_op_params_i32(result, 0, (int32_t) order); @@ -5626,7 +5636,7 @@ struct ggml_tensor * ggml_flash_attn( } //struct ggml_tensor * result = ggml_dup_tensor(ctx, q); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, q->n_dims, q->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne); int32_t t = masked ? 1 : 0; ggml_set_op_params(result, &t, sizeof(t)); @@ -5659,7 +5669,7 @@ struct ggml_tensor * ggml_flash_ff( } //struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, a->ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne); result->op = GGML_OP_FLASH_FF; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -5775,7 +5785,6 @@ struct ggml_tensor * ggml_win_part( const int np = npx*npy; const int64_t ne[4] = { a->ne[0], w, w, np, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); int32_t params[] = { npx, npy, w }; @@ -9571,16 +9580,11 @@ static bool ggml_compute_forward_mul_mat_use_blas( } #endif -// off1 = offset in i11 and i1 -// cne1 = ne11 and ne1 -// in a normal matrix multiplication, off1 = 0 and cne1 = ne1 -// during GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1 static void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst, - int64_t off1, int64_t cne1) { + struct ggml_tensor * dst) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -9648,9 +9652,9 @@ static void ggml_compute_forward_mul_mat( const int64_t i03 = i13/r3; const int64_t i02 = i12/r2; - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13); - float * d = (float *) ((char *) dst->data + off1*nb1 + i12*nb2 + i13*nb3); + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); if (type != GGML_TYPE_F32) { float * const wdata = params->wdata; @@ -9667,7 +9671,7 @@ static void ggml_compute_forward_mul_mat( } cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - cne1, ne01, ne10, + ne1, ne01, ne10, 1.0f, y, ne10, x, ne00, 0.0f, d, ne01); @@ -9683,7 +9687,7 @@ static void ggml_compute_forward_mul_mat( if (params->type == GGML_TASK_INIT) { if (src1->type != vec_dot_type) { char * wdata = params->wdata; - const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + const size_t row_size = ggml_row_size(vec_dot_type, ne10); assert(params->wsize >= ne11*ne12*ne13*row_size); assert(src1->type == GGML_TYPE_F32); @@ -9706,10 +9710,10 @@ static void ggml_compute_forward_mul_mat( } const void * wdata = (src1->type == vec_dot_type) ? 
src1->data : params->wdata; - const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + const size_t row_size = ggml_row_size(vec_dot_type, ne10); - const int64_t nr0 = ne01; // src0 rows - const int64_t nr1 = cne1*ne12*ne13; // src1 rows + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = ne1*ne12*ne13; // src1 rows //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); @@ -9751,9 +9755,9 @@ static void ggml_compute_forward_mul_mat( for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { - const int64_t i13 = (ir1/(ne12*cne1)); - const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1; - const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1; + const int64_t i13 = (ir1/(ne12*ne1)); + const int64_t i12 = (ir1 - i13*ne12*ne1)/ne1; + const int64_t i11 = (ir1 - i13*ne12*ne1 - i12*ne1); // broadcast src0 into src1 const int64_t i03 = i13/r3; @@ -9793,28 +9797,191 @@ static void ggml_compute_forward_mul_mat( static void ggml_compute_forward_mul_mat_id( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, + const struct ggml_tensor * ids, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - // during GGML_TASK_INIT the entire src1 is converted to vec_dot_type - ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]); - return; - } + const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const enum ggml_type type = src0->type; - const struct ggml_tensor * ids = src0; + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + // row groups const int id = ggml_get_op_params_i32(dst, 0); const int n_as = ggml_get_op_params_i32(dst, 1); - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); + char * wdata_src1_end = (src1->type == vec_dot_type) ? 
+ (char *) params->wdata : + (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t)); - GGML_ASSERT(row_id >= 0 && row_id < n_as); + int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] + int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11] - const struct ggml_tensor * src0_row = dst->src[row_id + 2]; - ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1); + #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] + + if (params->type == GGML_TASK_INIT) { + char * wdata = params->wdata; + if (src1->type != vec_dot_type) { + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + assert(params->wsize >= ne11*ne12*ne13*row_size); + assert(src1->type == GGML_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } + } + } + } + + // initialize matrix_row_counts + GGML_ASSERT(wdata == wdata_src1_end); + memset(matrix_row_counts, 0, n_as*sizeof(int64_t)); + + // group rows by src0 matrix + for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { + const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); + + GGML_ASSERT(row_id >= 0 && row_id < n_as); + MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01; + matrix_row_counts[row_id] += 1; + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // compute each matrix multiplication in sequence + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + + if (cne1 == 0) { + continue; + } + + const struct ggml_tensor * src0_cur = dst->src[cur_a + 2]; + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = cne1*ne12*ne13; // src1 rows + + //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); + + // distribute the thread work across the inner or outer loop based on which one is larger + + const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows + const int64_t nth1 = nr0 > nr1 ? 
1 : nth; // parallelize by src1 rows + + const int64_t ith0 = ith % nth0; + const int64_t ith1 = ith / nth0; + + const int64_t dr0 = (nr0 + nth0 - 1)/nth0; + const int64_t dr1 = (nr1 + nth1 - 1)/nth1; + + const int64_t ir010 = dr0*ith0; + const int64_t ir011 = MIN(ir010 + dr0, nr0); + + const int64_t ir110 = dr1*ith1; + const int64_t ir111 = MIN(ir110 + dr1, nr1); + + //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); + + // threads with no work simply yield (not sure if it helps) + if (ir010 >= ir011 || ir110 >= ir111) { + sched_yield(); + continue; + } + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + // attempt to reduce false-sharing (does not seem to make a difference) + float tmp[16]; + + for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { + for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { + const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix + const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1; + const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1); + const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11); + + // broadcast src0 into src1 + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char *) wdata + + (src1_cont || src1->type != vec_dot_type + ? 
(i11 + i12*ne11 + i13*ne12*ne11)*row_size + : (i11*nb11 + i12*nb12 + i13*nb13)); + + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { + vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col); + } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); + } + } + } } + + #undef MMID_MATRIX_ROW } // ggml_compute_forward_out_prod @@ -14182,7 +14349,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]); + ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_MUL_MAT_ID: { @@ -14558,7 +14725,7 @@ static struct ggml_tensor * ggml_recompute_graph_node( return replacements->vals[i]; } - struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, node->n_dims, node->ne); + struct ggml_tensor * clone = ggml_new_tensor(ctx, node->type, GGML_MAX_DIMS, node->ne); // insert clone into replacements GGML_ASSERT(replacements->set.keys[i] == NULL); // assert that we don't overwrite @@ -15982,7 +16149,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_MUL_MAT_ID: { - // FIXME: blas n_tasks = n_threads; } break; case GGML_OP_OUT_PROD: @@ -16307,25 +16473,21 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } else #endif if (node->src[1]->type != vec_dot_type) { - cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type); + cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); } } break; case GGML_OP_MUL_MAT_ID: { - const struct ggml_tensor * a = node->src[2]; - const struct ggml_tensor * b = node->src[1]; - const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) { - if (a->type != GGML_TYPE_F32) { - // here we need memory just for single 2D matrix from src0 - cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]); - } - } else -#endif - if (b->type != vec_dot_type) { - cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type); + const struct ggml_tensor * src0 = node->src[2]; + const struct ggml_tensor * src1 = node->src[1]; + const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type; + if (src1->type != vec_dot_type) { + cur = ggml_row_size(vec_dot_type, ggml_nelements(src1)); } + const int n_as = ggml_get_op_params_i32(node, 1); + cur = GGML_PAD(cur, sizeof(int64_t)); // align + cur += n_as * sizeof(int64_t); // matrix_row_counts + cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows } break; case GGML_OP_OUT_PROD: { @@ -16555,7 +16717,7 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou fprintf(fout, "%-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", ggml_type_name(tensor->type), ggml_op_name (tensor->op), - tensor->n_dims, + ggml_n_dims(tensor), ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], tensor->data, @@ -16570,7 +16732,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char arg, 
ggml_type_name(tensor->type), ggml_op_name (tensor->op), - tensor->n_dims, + ggml_n_dims(tensor), ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], tensor->data, @@ -16660,11 +16822,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { const uint32_t type = tensor->type; const uint32_t op = tensor->op; - const uint32_t n_dims = tensor->n_dims; fwrite(&type, sizeof(uint32_t), 1, fout); fwrite(&op, sizeof(uint32_t), 1, fout); - fwrite(&n_dims, sizeof(uint32_t), 1, fout); for (int j = 0; j < GGML_MAX_DIMS; ++j) { const uint64_t ne = tensor->ne[j]; @@ -16694,11 +16854,9 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { const uint32_t type = tensor->type; const uint32_t op = tensor->op; - const uint32_t n_dims = tensor->n_dims; fwrite(&type, sizeof(uint32_t), 1, fout); fwrite(&op, sizeof(uint32_t), 1, fout); - fwrite(&n_dims, sizeof(uint32_t), 1, fout); for (int j = 0; j < GGML_MAX_DIMS; ++j) { const uint64_t ne = tensor->ne[j]; @@ -16870,12 +17028,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * { uint32_t type; uint32_t op; - uint32_t n_dims; for (uint32_t i = 0; i < n_leafs; ++i) { type = *(const uint32_t *) ptr; ptr += sizeof(type); op = *(const uint32_t *) ptr; ptr += sizeof(op); - n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; @@ -16891,7 +17047,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * nb[j] = nb_cur; } - struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne); tensor->op = (enum ggml_op) op; @@ -16908,7 +17064,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * ptr += ggml_nbytes(tensor); - fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor)); } } @@ -16918,12 +17074,10 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * { uint32_t type; uint32_t op; - uint32_t n_dims; for (uint32_t i = 0; i < n_nodes; ++i) { type = *(const uint32_t *) ptr; ptr += sizeof(type); op = *(const uint32_t *) ptr; ptr += sizeof(op); - n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); enum ggml_op eop = (enum ggml_op) op; @@ -16994,7 +17148,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * } break; default: { - tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, GGML_MAX_DIMS, ne); tensor->op = eop; } break; @@ -17013,7 +17167,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * result->nodes[i] = tensor; - fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor)); } } } @@ -17151,7 +17305,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "(%s)|", ggml_type_name(node->type)); } - if (node->n_dims == 2) { + if (ggml_is_matrix(node)) { fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op)); } 
else { fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op)); @@ -17418,7 +17572,7 @@ static enum ggml_opt_result ggml_opt_adam( int64_t i = 0; for (int p = 0; p < np; ++p) { const int64_t ne = ggml_nelements(ps[p]); - const float p_decay = ((ps[p]->n_dims >= decay_min_ndim) ? decay : 0.0f) * sched; + const float p_decay = ((ggml_n_dims(ps[p]) >= decay_min_ndim) ? decay : 0.0f) * sched; for (int64_t j = 0; j < ne; ++j) { float x = ggml_get_f32_1d(ps[p], j); float g_ = g[i]*gnorm; @@ -18737,7 +18891,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return NULL; } - const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); + const size_t size_cur = ggml_row_size(info->type, ne); ctx->size += GGML_PAD(size_cur, ctx->alignment); } @@ -19241,8 +19395,8 @@ void gguf_add_tensor( ctx->infos[idx].ne[i] = 1; } - ctx->infos[idx].n_dims = tensor->n_dims; - for (int i = 0; i < tensor->n_dims; i++) { + ctx->infos[idx].n_dims = ggml_n_dims(tensor); + for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) { ctx->infos[idx].ne[i] = tensor->ne[i]; } diff --git a/ggml.h b/ggml.h index 2e294fb3f1925..b5a97f7c23292 100644 --- a/ggml.h +++ b/ggml.h @@ -509,7 +509,6 @@ extern "C" { struct ggml_backend_buffer * buffer; - int n_dims; int64_t ne[GGML_MAX_DIMS]; // number of elements size_t nb[GGML_MAX_DIMS]; // stride in bytes: // nb[0] = ggml_type_size(type) @@ -541,7 +540,7 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[12]; + char padding[8]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -646,11 +645,14 @@ extern "C" { GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN - GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split); - GGML_API int ggml_blck_size (enum ggml_type type); - GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block - GGML_API double ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float + GGML_API int ggml_blck_size(enum ggml_type type); + GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block + GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row + + GGML_DEPRECATED( + GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float + "use ggml_row_size() instead"); GGML_API const char * ggml_type_name(enum ggml_type type); GGML_API const char * ggml_op_name (enum ggml_op op); @@ -669,6 +671,11 @@ extern "C" { GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); + GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); diff 
--git a/klite.embd b/klite.embd index 4b246b1ed0959..69dcb289c8e7f 100644 --- a/klite.embd +++ b/klite.embd @@ -3215,6 +3215,7 @@ Current version: 100 const default_oai_base = "https://api.openai.com"; const default_claude_base = "https://api.anthropic.com"; const default_palm_base = "https://generativelanguage.googleapis.com/v1beta2/models/text-bison-001:generateText?key="; + const default_gemini_base = "https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?key="; const a1111_models_endpoint = "/sdapi/v1/sd-models"; const a1111_options_endpoint = "/sdapi/v1/options"; @@ -6526,6 +6527,7 @@ Current version: 100 else if(epchoice==4) //palm endpoint { let desired_palm_key = document.getElementById("custom_palm_key").value.trim(); + let mdlname = document.getElementById("custom_palm_model").value; if(desired_palm_key!="") { @@ -6535,7 +6537,7 @@ Current version: 100 custom_palm_key = desired_palm_key; localsettings.saved_palm_key = custom_palm_key; - selected_models = [{ "performance": 100.0, "queued": 0.0, "eta": 0, "name": "text-bison-001", "count": 1 }]; + selected_models = [{ "performance": 100.0, "queued": 0.0, "eta": 0, "name": mdlname, "count": 1 }]; selected_workers = []; if (perfdata == null) { //generate some fake perf data if horde is offline and using custom endpoint @@ -8912,7 +8914,7 @@ Current version: 100 } else if (custom_palm_key != "")//handle for PaLM { - let targetep = default_palm_base + custom_palm_key; + let urlbase = default_palm_base; let payload = {"prompt":{"text":submit_payload.prompt}, "temperature":submit_payload.params.temperature, "maxOutputTokens": submit_payload.params.max_length, @@ -8920,6 +8922,75 @@ Current version: 100 "topK": (submit_payload.params.top_k<1?300:submit_payload.params.top_k), "candidateCount":1}; + if(document.getElementById("custom_palm_model").value=="text-bison-001") + { + payload.safetySettings = [ + { + "category": "HARM_CATEGORY_TOXICITY", + "threshold": "BLOCK_NONE" + }, + { + "category": "HARM_CATEGORY_UNSPECIFIED", + "threshold": "BLOCK_NONE" + }, + { + "category": "HARM_CATEGORY_VIOLENCE", + "threshold": "BLOCK_NONE" + }, + { + "category": "HARM_CATEGORY_SEXUAL", + "threshold": "BLOCK_NONE" + } + ]; + } + else if(document.getElementById("custom_palm_model").value=="gemini-pro") + { + if(localsettings.opmode==1) + { + submit_payload.prompt = submit_payload.prompt + " \n ASSISTANT: Here is a continuation of the story: \nASSISTANT:"; + } + + urlbase = default_gemini_base; + payload = { + "contents": [ + { + "parts": [ + { + "text": submit_payload.prompt + } + ] + } + ], + "safetySettings": [ + { + "category": "HARM_CATEGORY_HARASSMENT", + "threshold": "BLOCK_NONE" + }, + { + "category": "HARM_CATEGORY_HATE_SPEECH", + "threshold": "BLOCK_NONE" + }, + { + "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", + "threshold": "BLOCK_NONE" + }, + { + "category": "HARM_CATEGORY_DANGEROUS_CONTENT", + "threshold": "BLOCK_NONE" + } + ], + "generationConfig": { + "temperature":submit_payload.params.temperature, + "maxOutputTokens": submit_payload.params.max_length, + "topP": submit_payload.params.top_p, + "topK": (submit_payload.params.top_k<1?300:submit_payload.params.top_k), + "candidateCount":1, + "stopSequences": [] + } + }; + } + + let targetep = urlbase + custom_palm_key; last_request_str = JSON.stringify(payload); fetch(targetep, { @@ -8933,8 +9004,10 @@ Current version: 100 .then((response) => response.json()) .then((data) => { console.log("sync finished response: " + JSON.stringify(data)); - if (custom_palm_key 
!= "" && data.candidates != null && data.candidates.length>0 && data.candidates[0].output != "") { + if (custom_palm_key != "" && data.candidates != null && data.candidates.length>0 && data.candidates[0].output && data.candidates[0].output != "") { synchro_polled_response = data.candidates[0].output; + }else if (custom_palm_key != "" && data.candidates != null && data.candidates.length>0 && data.candidates[0].content && data.candidates[0].content.parts != null && data.candidates[0].content.parts.length>0) { + synchro_polled_response = data.candidates[0].content.parts[0].text; } else { //error occurred, maybe captcha failed @@ -11769,7 +11842,7 @@ Current version: 100 - + @@ -11898,10 +11971,14 @@ Current version: 100
Claude Compatibility Rename Fix
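The --api-key option added to examples/server/server.cpp above expects clients to present the key as a bearer token, mirroring the validate_api_key middleware. A minimal client-side sketch, assuming cpp-httplib (the same header-only HTTP library the server uses); the host, port, key, and JSON body are placeholder values, not part of this patch:

    // build with the cpp-httplib header on the include path; values below are placeholders
    #include "httplib.h"
    #include <iostream>

    int main() {
        httplib::Client cli("127.0.0.1", 8080);
        httplib::Headers headers = { {"Authorization", "Bearer my-secret-key"} };

        // the middleware above answers 401 "Unauthorized: Invalid API Key" when the key is wrong or missing
        auto res = cli.Post("/completion", headers,
                            R"({"prompt": "Hello", "n_predict": 16})", "application/json");

        if (res && res->status == 200) {
            std::cout << res->body << std::endl;
        } else {
            std::cerr << "request failed, status = " << (res ? res->status : -1) << std::endl;
        }
        return 0;
    }
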
diff --git a/koboldcpp.py b/koboldcpp.py index e66ad1c20218e..8e863d5496316 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -395,7 +395,7 @@ def bring_terminal_to_foreground(): modelbusy = threading.Lock() requestsinqueue = 0 defaultport = 5001 -KcppVersion = "1.52.1.yr0-ROCm" +KcppVersion = "1.52.2.yr0-ROCm" showdebug = True showsamplerwarning = True showmaxctxwarning = True diff --git a/llama.cpp b/llama.cpp index fc38e18f6a84e..466c0a65e543e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1515,6 +1515,10 @@ struct llama_context { // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; +#ifndef NDEBUG + // guard against access to unset logits + std::vector logits_valid; +#endif bool logits_all = false; // input embedding (1-dimensional array: [n_embd]) @@ -1565,7 +1569,7 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead()); + cache.buf.resize(ggml_row_size(ktype, n_elements) + ggml_row_size(vtype, n_elements) + 2u*n_layer*ggml_tensor_overhead()); memset(cache.buf.data, 0, cache.buf.size); struct ggml_init_params params; @@ -3852,8 +3856,8 @@ static void llm_build_k_shift( ggml_rope_custom_inplace(ctx, ggml_view_3d(ctx, kv.k_l[il], n_embd_head, n_head_kv, n_ctx, - ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, - ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, + ggml_row_size(kv.k_l[il]->type, n_embd_head), + ggml_row_size(kv.k_l[il]->type, n_embd_gqa), 0), K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); @@ -3882,7 +3886,7 @@ static void llm_build_kv_store( cb(v_cur_t, "v_cur_t", il); struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa, - (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head); + (ggml_row_size(kv.k_l[il]->type, n_embd_gqa))*kv_head); cb(k_cache_view, "k_cache_view", il); struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa, @@ -4041,8 +4045,8 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * k = ggml_view_3d(ctx, kv.k_l[il], n_embd_head, n_kv, n_head_kv, - ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa, - ggml_type_sizef(kv.k_l[il]->type)*n_embd_head, + ggml_row_size(kv.k_l[il]->type, n_embd_gqa), + ggml_row_size(kv.k_l[il]->type, n_embd_head), 0); cb(k, "k", il); @@ -6182,6 +6186,14 @@ static int llama_decode_internal( { auto & logits_out = lctx.logits; +#ifndef NDEBUG + auto & logits_valid = lctx.logits_valid; + logits_valid.clear(); + logits_valid.resize(n_tokens); + + logits_out.clear(); +#endif + if (batch.logits) { logits_out.resize(n_vocab * n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { @@ -6189,13 +6201,22 @@ static int llama_decode_internal( continue; } memcpy(logits_out.data() + (n_vocab*i), (float *) ggml_get_data(res) + (n_vocab*i), sizeof(float)*n_vocab); +#ifndef NDEBUG + logits_valid[i] = true; +#endif } } else if (lctx.logits_all) { logits_out.resize(n_vocab * n_tokens); memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*n_tokens); +#ifndef NDEBUG + std::fill(logits_valid.begin(), logits_valid.end(), true); +#endif } else { logits_out.resize(n_vocab); memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(n_tokens - 1)), sizeof(float)*n_vocab); +#ifndef NDEBUG + logits_valid[n_tokens - 1] = true; +#endif } } @@ -8734,7 +8755,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, 
const s bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? // quantize only 2D tensors - quantize &= (tensor->n_dims == 2); + quantize &= (ggml_n_dims(tensor) == 2); quantize &= params->quantize_output_tensor || name != "output.weight"; quantize &= !params->only_copy; @@ -8889,53 +8910,60 @@ static int llama_apply_lora_from_file_internal( const int64_t t_start_lora_us = ggml_time_us(); - auto fin = std::ifstream(path_lora, std::ios::binary); - if (!fin) { - LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, path_lora); - return 1; - } + llama_file fin(path_lora, "rb"); // verify magic and version { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - uint32_t format_version; - fin.read((char *) &format_version, sizeof(format_version)); + uint32_t magic = fin.read_u32(); + if (magic != LLAMA_FILE_MAGIC_GGLA) { + LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); + return 1; + } + uint32_t format_version = fin.read_u32(); if (format_version != 1) { LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); return 1; } } - int32_t lora_r; - int32_t lora_alpha; - fin.read((char *) &lora_r, sizeof(lora_r)); - fin.read((char *) &lora_alpha, sizeof(lora_alpha)); + int32_t lora_r = fin.read_u32(); + int32_t lora_alpha = fin.read_u32(); float scaling = scale * (float)lora_alpha / (float)lora_r; LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); + // create a name -> tensor map of the model to accelerate lookups + // find the max tensor size to estimate the required temporary buffer size + size_t max_tensor_size = 0; + std::unordered_map<std::string, struct ggml_tensor *> model_tensors; + for (const auto & kv : model.tensors_by_name) { + model_tensors.insert(kv); + size_t f32_size = ggml_nelements(kv.second) * sizeof(float); + max_tensor_size = std::max(max_tensor_size, f32_size); + } + // create a temporary ggml context to store the lora tensors - // todo: calculate size from biggest possible tensor - std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull); + // TODO: use ggml-alloc + size_t lora_ctx_size = max_tensor_size * 3; + LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0); + std::vector<uint8_t> lora_buf(lora_ctx_size); + struct ggml_init_params params; params.mem_size = lora_buf.size(); params.mem_buffer = lora_buf.data(); params.no_alloc = false; - ggml_context * lora_ctx = ggml_init(params); - std::unordered_map<std::string, struct ggml_tensor *> lora_tensors; + using unique_context = std::unique_ptr<struct ggml_context, decltype(&ggml_free)>; - // create a name -> tensor map of the model to accelerate lookups - std::unordered_map<std::string, struct ggml_tensor *> model_tensors; - for (const auto & kv : model.tensors_by_name) { - model_tensors.insert(kv); - } + unique_context lora_ctx(nullptr, ggml_free); + lora_ctx.reset(ggml_init(params)); + std::unordered_map<std::string, struct ggml_tensor *> lora_tensors; // load base model std::unique_ptr<llama_model_loader> ml; - ggml_context * base_ctx = NULL; + + unique_context base_ctx(nullptr, ggml_free); std::vector<uint8_t> base_buf; if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); @@ -8944,6 +8972,7 @@ static int llama_apply_lora_from_file_internal( size_t ctx_size; size_t mmapped_size; ml->calc_sizes(ctx_size, mmapped_size); + base_buf.resize(ctx_size); ggml_init_params base_params; @@ -8951,9 +8980,9 @@ static int llama_apply_lora_from_file_internal( base_params.mem_buffer = base_buf.data(); base_params.no_alloc = ml->use_mmap; - base_ctx = ggml_init(base_params); + base_ctx.reset(ggml_init(base_params)); - // maybe this should in llama_model_loader + // 
maybe this should be in llama_model_loader if (ml->use_mmap) { ml->mapping.reset(new llama_mmap(&ml->file, /* prefetch */ 0, ggml_is_numa())); } @@ -8966,27 +8995,35 @@ static int llama_apply_lora_from_file_internal( std::vector<uint8_t> work_buffer; while (true) { + if (fin.tell() == fin.size) { + // eof + break; + } + int32_t n_dims; - int32_t length; + int32_t name_len; int32_t ftype; - fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast<char *>(&length), sizeof(length)); - fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype)); - if (fin.eof()) { - break; + fin.read_raw(&n_dims, sizeof(n_dims)); + fin.read_raw(&name_len, sizeof(name_len)); + fin.read_raw(&ftype, sizeof(ftype)); + + if (n_dims != 1 && n_dims != 2) { + LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); + return 1; } int32_t ne[2] = { 1, 1 }; for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i])); + fin.read_raw(&ne[i], sizeof(ne[i])); } std::string name; { + GGML_ASSERT(name_len <= 1024); char buf[1024]; - fin.read(buf, length); - name = std::string(buf, length); + fin.read_raw(buf, name_len); + name = std::string(buf, name_len); } // check for lora suffix and get the type of tensor @@ -9000,7 +9037,7 @@ static int llama_apply_lora_from_file_internal( std::string lora_type = name.substr(pos + lora_suffix.length()); std::string base_name = name; base_name.erase(pos); - // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(),base_name.c_str(), lora_type.c_str()); + // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str()); if (model_tensors.find(base_name) == model_tensors.end()) { LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); @@ -9019,22 +9056,15 @@ static int llama_apply_lora_from_file_internal( return false; } } - ggml_tensor * lora_tensor; - if (n_dims == 2) { - lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]); - } - else { - LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); - return 1; - } - ggml_set_name(lora_tensor, "lora_tensor"); + ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]); + ggml_set_name(lora_tensor, name.c_str()); // load tensor data - size_t offset = fin.tellg(); + size_t offset = fin.tell(); size_t tensor_data_size = ggml_nbytes(lora_tensor); offset = (offset + 31) & -32; - fin.seekg(offset); - fin.read((char*)lora_tensor->data, tensor_data_size); + fin.seek(offset, SEEK_SET); + fin.read_raw(lora_tensor->data, tensor_data_size); lora_tensors[name] = lora_tensor; @@ -9067,13 +9097,11 @@ static int llama_apply_lora_from_file_internal( // load from base model if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { - // TODO: throw LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } - // TODO: not tested!! maybe not working! 
- base_t = ml->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU); + base_t = ml->create_tensor(base_ctx.get(), base_name, { dest_t->ne[0], dest_t->ne[1] }, GGML_BACKEND_CPU); ml->load_data_for(base_t); } else { base_t = dest_t; @@ -9102,43 +9130,45 @@ static int llama_apply_lora_from_file_internal( } // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); + ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB); offload_func(BA); ggml_set_name(BA, "BA"); if (scaling != 1.0f) { - ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling); + ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx.get(), scaling); ggml_set_name(scale_tensor, "scale_tensor"); - BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor); + BA = ggml_scale_inplace(lora_ctx.get(), BA, scale_tensor); offload_func(BA); ggml_set_name(BA, "BA_scaled"); } ggml_tensor * r; if (base_t == dest_t) { - r = ggml_add_inplace(lora_ctx, dest_t, BA); + r = ggml_add_inplace(lora_ctx.get(), dest_t, BA); offload_func_force_inplace(r); ggml_set_name(r, "r_add_inplace"); } else { - r = ggml_add(lora_ctx, base_t, BA); + r = ggml_add(lora_ctx.get(), base_t, BA); offload_func(r); ggml_set_name(r, "r_add"); - r = ggml_cpy(lora_ctx, r, dest_t); + r = ggml_cpy(lora_ctx.get(), r, dest_t); offload_func(r); ggml_set_name(r, "r_cpy"); } - struct ggml_cgraph * gf = ggml_new_graph(lora_ctx); + struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get()); ggml_build_forward_expand(gf, r); ggml_graph_compute_helper(work_buffer, gf, n_threads); + // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other + GGML_ASSERT(lora_tensors.size() == 2); + // we won't need these tensors again, reset the context to save memory - ggml_free(lora_ctx); - lora_ctx = ggml_init(params); + lora_ctx.reset(ggml_init(params)); lora_tensors.clear(); n_tensors++; @@ -9148,12 +9178,6 @@ static int llama_apply_lora_from_file_internal( } } - // TODO: this should be in a destructor, it will leak on failure - ggml_free(lora_ctx); - if (base_ctx) { - ggml_free(base_ctx); - } - const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); @@ -10328,6 +10352,7 @@ float * llama_get_logits(struct llama_context * ctx) { } float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { + assert(ctx->logits_valid.at(i)); return ctx->logits.data() + i*ctx->model.hparams.n_vocab; } diff --git a/llama.h b/llama.h index d713821ced2f0..f7821bca57b56 100644 --- a/llama.h +++ b/llama.h @@ -39,6 +39,7 @@ #define LLAMA_MAX_RNG_STATE (64*1024) +#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp index ccc9f11e9e789..a398b3f881db0 100644 --- a/otherarch/rwkv_v3.cpp +++ b/otherarch/rwkv_v3.cpp @@ -1421,7 +1421,7 @@ bool rwkv_instance_from_file(const char * file_path, struct rwkv_instance & inst // Verify order of dimensions struct ggml_tensor * emb = model.emb; - RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_SHAPE, emb->n_dims == 2, "Unexpected dimension count of embedding matrix %d", emb->n_dims); + RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_SHAPE, ggml_n_dims(emb) == 2, "Unexpected dimension count of embedding matrix %d", ggml_n_dims(emb)); RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_DIMENSION, emb->ne[0] == 
model.header.n_embed, "Unexpected dimension of embedding matrix %" PRId64, emb->ne[0]); RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_DIMENSION, emb->ne[1] == model.header.n_vocab, "Unexpected dimension of embedding matrix %" PRId64, emb->ne[1]); diff --git a/requirements.txt b/requirements.txt index 4a868907f39e0..f16909f5b339f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ transformers>=4.34.0 gguf>=0.1.0 customtkinter>=5.1.0 packaging>=23.2 +protobuf>=4.21.0 diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index afca85143fb30..df2c3fb6e031f 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -54,7 +54,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); - std::vector<uint8_t> dataq(ggml_type_size(tensor->type)*size/ggml_blck_size(tensor->type)); + std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size)); int64_t hist[16]; ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size, hist); ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); @@ -72,6 +72,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) { ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type); size_t bs = ggml_blck_size(t->type); + std::vector<float> vq(ggml_blck_size(t->type)); + bool quantized = ggml_is_quantized(t->type); // access elements by index to avoid gaps in views for (int64_t i3 = 0; i3 < t->ne[3]; i3++) { @@ -85,9 +87,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) { tv.push_back(*(float *) &buf[i]); } else if (t->type == GGML_TYPE_I32) { tv.push_back((float)*(int32_t *) &buf[i]); - } else if (ggml_is_quantized(t->type)) { - std::vector<float> vq(ggml_blck_size(t->type)); - tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type)); + } else if (quantized) { + tt.to_float(&buf[i], vq.data(), bs); tv.insert(tv.end(), vq.begin(), vq.end()); } else { GGML_ASSERT(false);
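
Note on the ggml_type_sizef -> ggml_row_size migration that runs through the llama.cpp and test-backend-ops.cpp hunks above: ggml_type_sizef(type) yields a fractional bytes-per-element value that callers then multiplied by an element count, while ggml_row_size(type, ne) computes the byte size of a whole row of ne elements in integer arithmetic, which stays exact for block-quantized types. A minimal sketch of the intended usage, assuming ggml.h from this revision is on the include path (the Q4_0 type is only an illustrative choice):

    // Sketch: prefer ggml_row_size(type, ne) over ggml_type_sizef(type)*ne.
    // ggml_row_size does ne / block_size * type_size in integers, so there is
    // no floating-point rounding for quantized block types.
    #include "ggml.h"
    #include <cstdint>
    #include <cstdio>

    int main(void) {
        const int64_t ne = 4096; // elements in one row
        size_t bytes = ggml_row_size(GGML_TYPE_Q4_0, ne);
        printf("a Q4_0 row of %lld elements takes %zu bytes\n", (long long) ne, bytes);
        return 0;
    }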
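
The NDEBUG-only logits_valid bookkeeping added above makes llama_get_logits_ith(ctx, i) assert in debug builds whenever position i had no logits written by the last llama_decode call. A hedged sketch of the calling pattern the guard is meant to catch, using the llama.h API as it stands at this revision (variable and function names here are illustrative):

    // Sketch: only positions with batch.logits[i] != 0 have valid logits after decode.
    #include "llama.h"

    static void read_last_logits(struct llama_context * ctx, struct llama_batch & batch) {
        for (int32_t i = 0; i < batch.n_tokens; ++i) {
            batch.logits[i] = (i == batch.n_tokens - 1); // request logits for the last token only
        }
        llama_decode(ctx, batch);

        float * ok = llama_get_logits_ith(ctx, batch.n_tokens - 1); // valid
        // llama_get_logits_ith(ctx, 0) would now trip the debug assert,
        // since no logits were produced for position 0
        (void) ok;
    }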