Merge branch 'nomic-ai:main' into main
abdulrahman305 authored Feb 2, 2024
2 parents 5ccb2bf + 10e3f7b commit 3fc5977
Showing 79 changed files with 2,236 additions and 1,162 deletions.
17 changes: 7 additions & 10 deletions gpt4all-backend/bert.cpp
@@ -381,10 +381,9 @@ void bert_eval(

struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
// KQ = soft_max(KQ / sqrt(head width))
-    KQ = ggml_soft_max(ctx0,
-                       ggml_scale(ctx0,
-                                  KQ,
-                                  ggml_new_f32(ctx0, 1.0f / sqrt((float)d_head))));
+    KQ = ggml_soft_max(
+        ctx0, ggml_scale(ctx0, KQ, 1.0f / sqrt((float)d_head))
+    );

V = ggml_cont(ctx0, ggml_transpose(ctx0, V));
struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
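Aside: both this hunk and the matching one in gptj.cpp below compute the standard scaled dot-product attention step, soft_max(KQ / sqrt(d_head)); the diff itself only tracks an upstream ggml API change in which ggml_scale now takes a plain float instead of a one-element tensor built with ggml_new_f32. As a sketch of the math only — toy data and illustrative names, not gpt4all or ggml code:

// Plain C++ illustration of soft_max(KQ / sqrt(head width)) for one head.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n_tokens = 3, d_head = 4;
    std::vector<float> Q(n_tokens * d_head, 0.5f);  // query rows, row-major
    std::vector<float> K(n_tokens * d_head, 0.25f); // key rows, row-major

    for (int i = 0; i < n_tokens; ++i) {
        std::vector<float> row(n_tokens);
        float maxv = -1e30f;
        for (int j = 0; j < n_tokens; ++j) {
            float dot = 0.0f;
            for (int k = 0; k < d_head; ++k)
                dot += Q[i * d_head + k] * K[j * d_head + k];
            row[j] = dot / std::sqrt((float)d_head); // KQ / sqrt(head width)
            maxv = std::max(maxv, row[j]);
        }
        float sum = 0.0f; // subtract the max for a numerically stable softmax
        for (float &v : row) { v = std::exp(v - maxv); sum += v; }
        for (float v : row) std::printf("%.3f ", v / sum);
        std::printf("\n");
    }
    return 0;
}

With uniform toy inputs every score ties, so each row prints a uniform 0.333 distribution.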
@@ -490,10 +489,6 @@ struct bert_ctx * bert_load_from_file(const char *fname)
#endif

bert_ctx * new_bert = new bert_ctx;
-#if defined(GGML_USE_KOMPUTE)
-    new_bert->buf_compute.force_cpu = true;
-    new_bert->work_buf.force_cpu = true;
-#endif

bert_model & model = new_bert->model;
bert_vocab & vocab = new_bert->vocab;
@@ -714,9 +709,10 @@ Bert::~Bert() {
bert_free(d_ptr->ctx);
}

-bool Bert::loadModel(const std::string &modelPath, int n_ctx)
+bool Bert::loadModel(const std::string &modelPath, int n_ctx, int ngl)
{
(void)n_ctx;
+    (void)ngl;
d_ptr->ctx = bert_load_from_file(modelPath.c_str());
d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
d_ptr->modelLoaded = d_ptr->ctx != nullptr;
@@ -729,10 +725,11 @@ bool Bert::isModelLoaded() const
return d_ptr->modelLoaded;
}

-size_t Bert::requiredMem(const std::string &modelPath, int n_ctx)
+size_t Bert::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
{
(void)modelPath;
(void)n_ctx;
+    (void)ngl;
return 0;
}

4 changes: 2 additions & 2 deletions gpt4all-backend/bert_impl.h
@@ -18,9 +18,9 @@ class Bert : public LLModel {

bool supportsEmbedding() const override { return true; }
bool supportsCompletion() const override { return true; }
-    bool loadModel(const std::string &modelPath, int n_ctx) override;
+    bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
bool isModelLoaded() const override;
-    size_t requiredMem(const std::string &modelPath, int n_ctx) override;
+    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
size_t stateSize() const override;
size_t saveState(uint8_t *dest) const override;
size_t restoreState(const uint8_t *src) override;
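Aside: the header change mirrors the .cpp above — every backend now accepts an ngl (number of GPU layers to offload) argument on loadModel and requiredMem, and CPU-only backends such as Bert simply discard it. A minimal sketch of the pattern, using a cut-down stand-in base class rather than the real LLModel interface:

// MiniLLModel/MiniBackend are illustrative stand-ins, not the real
// gpt4all LLModel hierarchy; they show only the widened signatures.
#include <cstddef>
#include <string>

struct MiniLLModel {
    virtual ~MiniLLModel() = default;
    virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0;
    virtual std::size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
};

struct MiniBackend : MiniLLModel {
    bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override {
        (void)n_ctx;
        (void)ngl; // CPU-only: the GPU layer count is accepted but unused
        return !modelPath.empty();
    }
    std::size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override {
        (void)modelPath; (void)n_ctx; (void)ngl;
        return 0; // like Bert: no up-front estimate is reported
    }
};

Keeping ngl in every override, even where unused, lets the dispatch layer pass the same argument list to all backends.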
12 changes: 5 additions & 7 deletions gpt4all-backend/gptj.cpp
@@ -414,11 +414,7 @@ bool gptj_eval(
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

// KQ_scaled = KQ / sqrt(n_embd/n_head)
-    struct ggml_tensor * KQ_scaled =
-        ggml_scale(ctx0,
-            KQ,
-            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-        );
+    struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));

// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
@@ -676,17 +672,19 @@ GPTJ::GPTJ()
d_ptr->modelLoaded = false;
}

-size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx) {
+size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
(void)n_ctx;
+    (void)ngl;
gptj_model dummy_model;
gpt_vocab dummy_vocab;
size_t mem_req;
gptj_model_load(modelPath, dummy_model, dummy_vocab, &mem_req);
return mem_req;
}

-bool GPTJ::loadModel(const std::string &modelPath, int n_ctx) {
+bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl) {
(void)n_ctx;
+    (void)ngl;
std::mt19937 rng(time(NULL));
d_ptr->rng = rng;
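
Aside: requiredMem here works as a dry run — gptj_model_load fills mem_req from throwaway model/vocab objects so a caller can budget memory before a real load. A caller-side sketch, assuming gptj_impl.h is reachable on the include path; availableRamBytes() is a hypothetical stub:

// Hypothetical caller: compare the dry-run estimate against a RAM
// budget before committing to the full load; numbers are made up.
#include <cstddef>
#include <cstdio>
#include <string>
#include "gptj_impl.h"

static std::size_t availableRamBytes() {
    return 8ull * 1024 * 1024 * 1024; // stand-in for a real RAM query
}

int main() {
    GPTJ model;
    const std::string path = "/path/to/gptj-model.bin";
    const std::size_t need = model.requiredMem(path, /*n_ctx=*/2048, /*ngl=*/0);
    if (need > availableRamBytes()) {
        std::fprintf(stderr, "model needs %zu bytes, not loading\n", need);
        return 1;
    }
    return model.loadModel(path, /*n_ctx=*/2048, /*ngl=*/0) ? 0 : 1;
}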

4 changes: 2 additions & 2 deletions gpt4all-backend/gptj_impl.h
@@ -17,9 +17,9 @@ class GPTJ : public LLModel {

bool supportsEmbedding() const override { return false; }
bool supportsCompletion() const override { return true; }
-    bool loadModel(const std::string &modelPath, int n_ctx) override;
+    bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
bool isModelLoaded() const override;
-    size_t requiredMem(const std::string &modelPath, int n_ctx) override;
+    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
size_t stateSize() const override;
size_t saveState(uint8_t *dest) const override;
size_t restoreState(const uint8_t *src) override;
2 changes: 1 addition & 1 deletion gpt4all-backend/llama.cpp-mainline
3 changes: 1 addition & 2 deletions gpt4all-backend/llama.cpp.cmake
@@ -175,6 +175,7 @@ if (LLAMA_KOMPUTE)
DEPENDS ${LLAMA_DIR}/${source}
${LLAMA_DIR}/kompute-shaders/common.comp
${LLAMA_DIR}/kompute-shaders/op_getrows.comp
+            ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n.comp
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
COMMENT "Compiling ${source} to ${source}.spv"
@@ -231,7 +232,6 @@ if (LLAMA_KOMPUTE)
kompute-shaders/op_add.comp
kompute-shaders/op_addrow.comp
kompute-shaders/op_mul.comp
-        kompute-shaders/op_mulrow.comp
kompute-shaders/op_silu.comp
kompute-shaders/op_relu.comp
kompute-shaders/op_gelu.comp
@@ -264,7 +264,6 @@ if (LLAMA_KOMPUTE)
shaderop_add.h
shaderop_addrow.h
shaderop_mul.h
-        shaderop_mulrow.h
shaderop_silu.h
shaderop_relu.h
shaderop_gelu.h