Update to 1.52.2 with 'upstream/concedo'

YellowRoseCx · Dec 18, 2023 · 509ad00 · 509ad00
2 parents eee005e + ec05230
commit 509ad00
Show file tree

Hide file tree

Showing 21 changed files with 611 additions and 275 deletions.
diff --git a/common/train.cpp b/common/train.cpp
@@ -71,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)
 
 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
     float scale = 1.0f; // xavier
-    switch (tensor->n_dims) {
+    switch (ggml_n_dims(tensor)) {
         case 1:
             scale /= sqrtf((float) tensor->ne[0]);
             for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
@@ -119,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct
 }
 
 struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
-    switch (tensor->n_dims) {
+    switch (ggml_n_dims(tensor)) {
         case 1:
             for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                 float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
@@ -183,25 +183,27 @@ float fclamp(const float v, const float min, const float max) {
 }
 
 void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
-    GGML_ASSERT(tensor->n_dims == 1);
     GGML_ASSERT(tensor->ne[0] == ne0);
+    GGML_ASSERT(tensor->ne[1] == 1);
+    GGML_ASSERT(tensor->ne[2] == 1);
+    GGML_ASSERT(tensor->ne[3] == 1);
 }
 
 void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
-    GGML_ASSERT(tensor->n_dims == 2);
     GGML_ASSERT(tensor->ne[0] == ne0);
     GGML_ASSERT(tensor->ne[1] == ne1);
+    GGML_ASSERT(tensor->ne[2] == 1);
+    GGML_ASSERT(tensor->ne[3] == 1);
 }
 
 void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
-    GGML_ASSERT(tensor->n_dims == 3);
     GGML_ASSERT(tensor->ne[0] == ne0);
     GGML_ASSERT(tensor->ne[1] == ne1);
     GGML_ASSERT(tensor->ne[2] == ne2);
+    GGML_ASSERT(tensor->ne[3] == 1);
 }
 
 void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
-    GGML_ASSERT(tensor->n_dims == 4);
     GGML_ASSERT(tensor->ne[0] == ne0);
     GGML_ASSERT(tensor->ne[1] == ne1);
     GGML_ASSERT(tensor->ne[2] == ne2);
@@ -225,8 +227,8 @@ int64_t get_example_targets_batch(
     bool                   sample_random_offsets
 ) {
     GGML_ASSERT(samples_count > 0);
-    GGML_ASSERT(tokens_input->n_dims  == 2);
-    GGML_ASSERT(target_probs->n_dims  == 3);
+    GGML_ASSERT(ggml_is_matrix(tokens_input));
+    GGML_ASSERT(ggml_is_3d(target_probs));
     int64_t n_vocab  = target_probs->ne[0];
     int64_t n_tokens = tokens_input->ne[0];
     int64_t n_batch  = tokens_input->ne[1];

diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py
@@ -3,49 +3,20 @@
 
 import json
 import os
-import re
 import struct
 import sys
 from typing import Any, BinaryIO, Sequence
 
 import numpy as np
 import torch
 
-NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
-
+from pathlib import Path
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
 
-HF_SUBLAYER_TO_GGML = {
-    "self_attn.q_proj": "attn_q",
-    "self_attn.k_proj": "attn_k",
-    "self_attn.v_proj": "attn_v",
-    "self_attn.o_proj": "attn_output",
-    "mlp.gate_proj": "ffn_gate",
-    "mlp.down_proj": "ffn_down",
-    "mlp.up_proj": "ffn_up",
-    "input_layernorm": "attn_norm",
-    "post_attention_layernorm": "ffn_norm",
-}
-
-
-def translate_tensor_name(t: str) -> str:
-    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
-    if match:
-        nn = match.group(1)
-        sub_layer = match.group(2)
-        lora_type = match.group(3)
-
-        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
-        if sub_layer_renamed is None:
-            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
-            sys.exit(1)
 
-        output_string = (
-            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
-        )
-        return output_string
-    else:
-        print(f"Error: unrecognized tensor {t}")
-        sys.exit(1)
+NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
 
 
 def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
@@ -61,9 +32,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
     fout.write(struct.pack("i", int(params["lora_alpha"])))
 
 
-def write_tensor_header(
-    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
-) -> None:
+def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
     sname = name.encode("utf-8")
     fout.write(
         struct.pack(
@@ -78,18 +47,27 @@ def write_tensor_header(
     fout.seek((fout.tell() + 31) & -32)
 
 
-if len(sys.argv) != 2:
-    print(f"Usage: python {sys.argv[0]} <path>")
+if len(sys.argv) < 2:
+    print(f"Usage: python {sys.argv[0]} <path> [arch]")
     print(
         "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
     )
+    print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
     sys.exit(1)
 
 input_json = os.path.join(sys.argv[1], "adapter_config.json")
 input_model = os.path.join(sys.argv[1], "adapter_model.bin")
 output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
 
 model = torch.load(input_model, map_location="cpu")
+arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
+
+if arch_name not in gguf.MODEL_ARCH_NAMES.values():
+    print(f"Error: unsupported architecture {arch_name}")
+    sys.exit(1)
+
+arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
+name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
 
 with open(input_json, "r") as f:
     params = json.load(f)
@@ -117,6 +95,7 @@ def write_tensor_header(
 
     write_file_header(fout, params)
     for k, v in model.items():
+        orig_k = k
         if k.endswith(".default.weight"):
             k = k.replace(".default.weight", ".weight")
         if k in ["llama_proj.weight", "llama_proj.bias"]:
@@ -129,7 +108,32 @@ def write_tensor_header(
             v = v.float()
 
         t = v.detach().numpy()
-        tname = translate_tensor_name(k)
+
+        prefix = "base_model.model."
+        if k.startswith(prefix):
+            k = k[len(prefix) :]
+
+        lora_suffixes = (".lora_A.weight", ".lora_B.weight")
+        if k.endswith(lora_suffixes):
+            suffix = k[-len(lora_suffixes[0]):]
+            k = k[: -len(lora_suffixes[0])]
+        else:
+            print(f"Error: unrecognized tensor name {orig_k}")
+            sys.exit(1)
+
+        tname = name_map.get_name(k)
+        if tname is None:
+            print(f"Error: could not map tensor name {orig_k}")
+            print(" Note: the arch parameter must be specified if the model is not llama")
+            sys.exit(1)
+
+        if suffix == ".lora_A.weight":
+            tname += ".weight.loraA"
+        elif suffix == ".lora_B.weight":
+            tname += ".weight.loraB"
+        else:
+            assert False
+
         print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
         write_tensor_header(fout, tname, t.shape, t.dtype)
         t.tofile(fout)

diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
@@ -1258,9 +1258,9 @@ static struct ggml_tensor * forward_lora(
 }
 
 static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
-    assert(logits->n_dims == 2);
-    assert(probs->n_dims == 2);
-    assert(best_samples->n_dims == 1);
+    assert(ggml_is_matrix(logits));
+    assert(ggml_is_matrix(probs));
+    assert(ggml_is_vector(best_samples));
     assert(logits->ne[1] == best_samples->ne[0]);
     assert(logits->ne[0] == probs->ne[0]);
     assert(logits->ne[1] == probs->ne[1]);
@@ -1292,9 +1292,9 @@ static void sample_softmax_batch(
     struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
     struct ggml_tensor * best_samples
 ) {
-    GGML_ASSERT(best_samples->n_dims == 2);
-    GGML_ASSERT(logits->n_dims == 3);
-    GGML_ASSERT(probs->n_dims == 3);
+    GGML_ASSERT(ggml_is_matrix(best_samples));
+    GGML_ASSERT(ggml_is_3d(logits));
+    GGML_ASSERT(ggml_is_3d(probs));
     int n_tokens = best_samples->ne[0];
     int n_batch  = best_samples->ne[1];
     int n_vocab  = logits->ne[0];
@@ -1334,7 +1334,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }
 
 static void print_matrix(struct ggml_tensor * probs) {
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
     for (int i = 0; i < probs->ne[1]; ++i) {
         for (int k = 0; k < probs->ne[0]; ++k) {
             float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
@@ -1386,8 +1386,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu
 static void get_example_targets_batch(
     struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
 ) {
-    GGML_ASSERT(tokens_input->n_dims == 2);
-    GGML_ASSERT(     targets->n_dims == 3);
+    GGML_ASSERT(ggml_is_matrix(tokens_input));
+    GGML_ASSERT(ggml_is_3d(targets));
     int n_tokens = tokens_input->ne[0];
     int n_batch  = tokens_input->ne[1];
     GGML_ASSERT(n_tokens == targets->ne[1]);

diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
@@ -130,13 +130,13 @@ int main(int argc, char ** argv)  {
     const ggml_type qtype = GGML_TYPE_Q4_1;
 
     size_t ctx_size = 0;
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
-    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
+    ctx_size += ggml_row_size(qtype,         sizex*sizey);
+    ctx_size += ggml_row_size(qtype,         sizex*sizey);
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
+    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
     ctx_size += 1024*1024*16;
 
     printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }
 
 static void print_matrix(struct ggml_tensor * probs) {
-    assert(probs->n_dims == 2);
+    assert(ggml_is_matrix(probs));
     for (int i = 0; i < probs->ne[1]; ++i) {
         for (int k = 0; k < probs->ne[0]; ++k) {
             float p = get_f32_2d(probs, k, i);
@@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
 
 static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
     int ct;
-    switch (gg_weights->n_dims){
+    switch (ggml_n_dims(gg_weights)) {
         case 1:
             ct = 0;
             for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
@@ -1110,7 +1110,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor,
         name = ggml_get_name(tensor);
     }
     uint32_t name_len = strlen(name);
-    uint32_t nd = tensor->n_dims;
+    uint32_t nd = ggml_n_dims(tensor);
     uint32_t ne[4] = { (uint32_t)tensor->ne[0],
                        (uint32_t)tensor->ne[1],
                        (uint32_t)tensor->ne[2],

diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
@@ -195,7 +195,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
 
             struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
 
-            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
 
             // print first 10 elements
             const float * data = (const float *) cur->data;

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
@@ -514,7 +514,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             ctx_size += padded_size;
             if (verbosity >= 3) {
                 printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i,
-                       cur->n_dims, cur->name, tensor_size, padded_size, offset);
+                       ggml_n_dims(cur), cur->name, tensor_size, padded_size, offset);
             }
         }
     }
@@ -962,7 +962,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
         }
 
         // quantize only 2D tensors
-        quantize &= (cur->n_dims == 2);
+        quantize &= (ggml_n_dims(cur) == 2);
 
         if (quantize) {
             new_type = type;
@@ -1035,7 +1035,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             fout.put(0);
         }
 
-        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize,
+        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
                orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
     }
 

diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js
@@ -34,7 +34,8 @@ export async function* llama(prompt, params = {}, config = {}) {
     headers: {
       'Connection': 'keep-alive',
       'Content-Type': 'application/json',
-      'Accept': 'text/event-stream'
+      'Accept': 'text/event-stream',
+      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
     },
     signal: controller.signal,
   });

diff --git a/examples/server/public/index.html b/examples/server/public/index.html
@@ -235,7 +235,8 @@
       grammar: '',
       n_probs: 0, // no completion_probabilities,
       image_data: [],
-      cache_prompt: true
+      cache_prompt: true,
+      api_key: ''
     })
 
     /* START: Support for storing prompt templates and parameters in browsers LocalStorage */
@@ -790,6 +791,10 @@
             <fieldset>
               ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
             </fieldset>
+            <fieldset>
+              <label for="api_key">API Key</label>
+              <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
+            </fieldset>
           </details>
         </form>
       `