Commit

Merge branch 'nomic-ai:main' into main

abdulrahman305 authored Feb 26, 2024
2 parents 1d07ae2 + f8b1069 commit 8e5a234
Showing 44 changed files with 1,735 additions and 1,005 deletions.
18 changes: 12 additions & 6 deletions .circleci/continue_config.yml
@@ -611,6 +611,7 @@ jobs:
$Env:Path += ";$MinGwBin"
$Env:Path += ";C:\Program Files\CMake\bin"
$Env:Path += ";C:\VulkanSDK\1.3.261.1\bin"
$Env:VULKAN_SDK = "C:\VulkanSDK\1.3.261.1"
cd gpt4all-backend
mkdir runtimes/win-x64
cd runtimes/win-x64
@@ -651,6 +652,7 @@ jobs:
command: |
$Env:Path += ";C:\Program Files\CMake\bin"
$Env:Path += ";C:\VulkanSDK\1.3.261.1\bin"
$Env:VULKAN_SDK = "C:\VulkanSDK\1.3.261.1"
cd gpt4all-backend
mkdir runtimes/win-x64_msvc
cd runtimes/win-x64_msvc
@@ -1107,8 +1109,12 @@ workflows:
jobs:
- hold:
type: approval
- csharp-hold:
type: approval
- nuget-hold:
type: approval
- nodejs-hold:
type: approval
- npm-hold:
type: approval
- build-bindings-backend-linux:
@@ -1151,21 +1157,21 @@
branches:
only:
requires:
- npm-hold
- nodejs-hold
- build-bindings-backend-linux
- build-nodejs-windows:
filters:
branches:
only:
requires:
- npm-hold
- nodejs-hold
- build-bindings-backend-windows-msvc
- build-nodejs-macos:
filters:
branches:
only:
requires:
- npm-hold
- nodejs-hold
- build-bindings-backend-macos


@@ -1175,21 +1181,21 @@
branches:
only:
requires:
- nuget-hold
- csharp-hold
- build-bindings-backend-linux
- build-csharp-windows:
filters:
branches:
only:
requires:
- nuget-hold
- csharp-hold
- build-bindings-backend-windows
- build-csharp-macos:
filters:
branches:
only:
requires:
- nuget-hold
- csharp-hold
- build-bindings-backend-macos
- store-and-upload-nupkgs:
filters:
2 changes: 1 addition & 1 deletion gpt4all-backend/llama.cpp-mainline
Submodule llama.cpp-mainline updated 1 file
+35 −11 llama.cpp
21 changes: 14 additions & 7 deletions gpt4all-backend/llamamodel.cpp
@@ -64,6 +64,7 @@ static int llama_sample_top_p_top_k(
int last_n_tokens_size,
int top_k,
float top_p,
float min_p,
float temp,
float repeat_penalty,
int32_t pos) {
@@ -83,6 +84,7 @@ static int llama_sample_top_p_top_k(
llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_min_p(ctx, &candidates_p, min_p, 1);
llama_sample_temp(ctx, &candidates_p, temp);
return llama_sample_token(ctx, &candidates_p);
}
@@ -260,19 +262,23 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
d_ptr->model_params.progress_callback = &LLModel::staticProgressCallback;
d_ptr->model_params.progress_callback_user_data = this;

#ifdef GGML_USE_METAL
#ifdef GGML_USE_KOMPUTE
if (d_ptr->device != -1) {
d_ptr->model_params.main_gpu = d_ptr->device;
d_ptr->model_params.n_gpu_layers = ngl;
}
#elif defined(GGML_USE_METAL)
(void)ngl;

if (llama_verbose()) {
std::cerr << "llama.cpp: using Metal" << std::endl;
}

// always fully offload on Metal
// TODO(cebtenzzre): use this parameter to allow using more than 53% of system RAM to load a model
d_ptr->model_params.n_gpu_layers = 100;
#elif defined(GGML_USE_KOMPUTE)
if (d_ptr->device != -1) {
d_ptr->model_params.main_gpu = d_ptr->device;
d_ptr->model_params.n_gpu_layers = ngl;
}
#else
(void)ngl;
#endif

d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
@@ -388,7 +394,7 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
return llama_sample_top_p_top_k(d_ptr->ctx,
promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.temp,
n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.min_p, promptCtx.temp,
promptCtx.repeat_penalty, promptCtx.n_last_batch_tokens - 1);
}

@@ -469,6 +475,7 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
return devices;
}
#else
(void)memoryRequired;
std::cerr << __func__ << ": built without Kompute\n";
#endif

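A note on the new parameter for readers of this file's hunks: min-p sampling discards candidate tokens whose probability is below min_p times the probability of the most likely token, so the cutoff adapts to how peaked the distribution is, and min_p = 0 leaves the candidates untouched. The changes above simply insert llama_sample_min_p into the existing sampler chain and thread promptCtx.min_p through sampleToken. A rough conceptual sketch of the filtering step (not the llama.cpp implementation) could look like this:

#include <algorithm>
#include <vector>

struct Candidate { int token; float prob; };

// Keep only candidates whose probability is at least min_p * (highest probability),
// then renormalize what remains. min_p == 0 keeps everything.
std::vector<Candidate> min_p_filter(std::vector<Candidate> cands, float min_p) {
    if (cands.empty() || min_p <= 0.0f)
        return cands;
    const float max_prob = std::max_element(cands.begin(), cands.end(),
        [](const Candidate &a, const Candidate &b) { return a.prob < b.prob; })->prob;
    const float cutoff = min_p * max_prob;
    cands.erase(std::remove_if(cands.begin(), cands.end(),
        [cutoff](const Candidate &c) { return c.prob < cutoff; }), cands.end());
    float total = 0.0f;
    for (const Candidate &c : cands) total += c.prob;
    for (Candidate &c : cands) c.prob /= total;
    return cands;
}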
1 change: 1 addition & 0 deletions gpt4all-backend/llmodel.h
@@ -66,6 +66,7 @@ class LLModel {
int32_t n_predict = 200;
int32_t top_k = 40;
float top_p = 0.9f;
float min_p = 0.0f;
float temp = 0.9f;
int32_t n_batch = 9;
float repeat_penalty = 1.10f;
2 changes: 2 additions & 0 deletions gpt4all-backend/llmodel_c.cpp
@@ -134,6 +134,7 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
wrapper->promptContext.n_predict = ctx->n_predict;
wrapper->promptContext.top_k = ctx->top_k;
wrapper->promptContext.top_p = ctx->top_p;
wrapper->promptContext.min_p = ctx->min_p;
wrapper->promptContext.temp = ctx->temp;
wrapper->promptContext.n_batch = ctx->n_batch;
wrapper->promptContext.repeat_penalty = ctx->repeat_penalty;
@@ -156,6 +157,7 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
ctx->n_predict = wrapper->promptContext.n_predict;
ctx->top_k = wrapper->promptContext.top_k;
ctx->top_p = wrapper->promptContext.top_p;
ctx->min_p = wrapper->promptContext.min_p;
ctx->temp = wrapper->promptContext.temp;
ctx->n_batch = wrapper->promptContext.n_batch;
ctx->repeat_penalty = wrapper->promptContext.repeat_penalty;
1 change: 1 addition & 0 deletions gpt4all-backend/llmodel_c.h
@@ -39,6 +39,7 @@ struct llmodel_prompt_context {
int32_t n_predict; // number of tokens to predict
int32_t top_k; // top k logits to sample from
float top_p; // nucleus sampling probability threshold
float min_p; // Min P sampling
float temp; // temperature to adjust model's output distribution
int32_t n_batch; // number of predictions to generate in parallel
float repeat_penalty; // penalty factor for repeated tokens
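Because llmodel_prompt_context is part of the C ABI shared by every binding, inserting min_p between top_p and temp shifts the offsets of the later fields, so out-of-tree callers have to rebuild against the updated header. A minimal sketch of setting the new field from C or C++, using the default values shown in the llmodel.h hunk above (the remaining members of the struct are omitted for brevity):

#include "llmodel_c.h"

// Sketch: populate the sampling fields visible in this diff. A real caller
// also fills the remaining members of llmodel_prompt_context before passing
// it to llmodel_prompt().
static void configure_sampling(llmodel_prompt_context *ctx) {
    ctx->n_predict      = 200;
    ctx->top_k          = 40;
    ctx->top_p          = 0.9f;
    ctx->min_p          = 0.0f;   // 0.0f disables min-p, matching the new default
    ctx->temp           = 0.9f;
    ctx->n_batch        = 9;
    ctx->repeat_penalty = 1.1f;
}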
2 changes: 2 additions & 0 deletions gpt4all-bindings/cli/app.py
@@ -120,6 +120,7 @@ def _old_loop(gpt4all_instance):
n_predict=200,
top_k=40,
top_p=0.9,
min_p=0.0,
temp=0.9,
n_batch=9,
repeat_penalty=1.1,
@@ -156,6 +157,7 @@ def _new_loop(gpt4all_instance):
temp=0.9,
top_k=40,
top_p=0.9,
min_p=0.0,
repeat_penalty=1.1,
repeat_last_n=64,
n_batch=9,
9 changes: 9 additions & 0 deletions gpt4all-bindings/csharp/Gpt4All/Bindings/LLPromptContext.cs
@@ -64,6 +64,15 @@ public float TopP
set => _ctx.top_p = value;
}

/// <summary>
/// min p sampling probability threshold
/// </summary>
public float MinP
{
get => _ctx.min_p;
set => _ctx.min_p = value;
}

/// <summary>
/// temperature to adjust model's output distribution
/// </summary>
2 changes: 2 additions & 0 deletions gpt4all-bindings/csharp/Gpt4All/Bindings/NativeMethods.cs
@@ -29,6 +29,8 @@ public unsafe partial struct llmodel_prompt_context

public float top_p;

public float min_p;

public float temp;

[NativeTypeName("int32_t")]
Original file line number Diff line number Diff line change
@@ -16,6 +16,7 @@ public static string Dump(this LLModelPromptContext context)
n_predict = {ctx.n_predict}
top_k = {ctx.top_k}
top_p = {ctx.top_p}
min_p = {ctx.min_p}
temp = {ctx.temp}
n_batch = {ctx.n_batch}
repeat_penalty = {ctx.repeat_penalty}
Original file line number Diff line number Diff line change
@@ -12,6 +12,7 @@ public static LLModelPromptContext ToPromptContext(this PredictRequestOptions op
TokensSize = opts.TokensSize,
TopK = opts.TopK,
TopP = opts.TopP,
MinP = opts.MinP,
PastNum = opts.PastConversationTokensNum,
RepeatPenalty = opts.RepeatPenalty,
Temperature = opts.Temperature,
Original file line number Diff line number Diff line change
@@ -16,6 +16,8 @@ public record PredictRequestOptions

public float TopP { get; init; } = 0.9f;

public float MinP { get; init; } = 0.0f;

public float Temperature { get; init; } = 0.1f;

public int Batches { get; init; } = 8;
4 changes: 3 additions & 1 deletion gpt4all-bindings/golang/binding.cpp
@@ -36,7 +36,7 @@ std::string res = "";
void * mm;

void model_prompt( const char *prompt, void *m, char* result, int repeat_last_n, float repeat_penalty, int n_ctx, int tokens, int top_k,
float top_p, float temp, int n_batch,float ctx_erase)
float top_p, float min_p, float temp, int n_batch,float ctx_erase)
{
llmodel_model* model = (llmodel_model*) m;

@@ -69,6 +69,7 @@ void model_prompt( const char *prompt, void *m, char* result, int repeat_last_n,
.n_predict = 50,
.top_k = 10,
.top_p = 0.9,
.min_p = 0.0,
.temp = 1.0,
.n_batch = 1,
.repeat_penalty = 1.2,
@@ -83,6 +84,7 @@ void model_prompt( const char *prompt, void *m, char* result, int repeat_last_n,
prompt_context->top_k = top_k;
prompt_context->context_erase = ctx_erase;
prompt_context->top_p = top_p;
prompt_context->min_p = min_p;
prompt_context->temp = temp;
prompt_context->n_batch = n_batch;

4 changes: 2 additions & 2 deletions gpt4all-bindings/golang/binding.h
@@ -7,12 +7,12 @@ extern "C" {
void* load_model(const char *fname, int n_threads);

void model_prompt( const char *prompt, void *m, char* result, int repeat_last_n, float repeat_penalty, int n_ctx, int tokens, int top_k,
float top_p, float temp, int n_batch,float ctx_erase);
float top_p, float min_p, float temp, int n_batch,float ctx_erase);

void free_model(void *state_ptr);

extern unsigned char getTokenCallback(void *, char *);

#ifdef __cplusplus
}
#endif
#endif
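Since min_p was inserted between top_p and temp in model_prompt's signature in both binding.cpp and binding.h, any code calling this shim directly must pass the extra argument. A hypothetical updated call site (argument values are illustrative; the model pointer comes from load_model and the result buffer from the caller):

#include "binding.h"

// Hypothetical call site updated for the widened signature; the values below
// are illustrative, not defaults taken from the bindings.
void run_prompt(void *model, char *result) {
    model_prompt("Hello, world", model, result,
                 /*repeat_last_n=*/64, /*repeat_penalty=*/1.1f,
                 /*n_ctx=*/2048, /*tokens=*/200, /*top_k=*/40,
                 /*top_p=*/0.9f, /*min_p=*/0.0f, /*temp=*/0.9f,
                 /*n_batch=*/9, /*ctx_erase=*/0.55f);
}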
4 changes: 2 additions & 2 deletions gpt4all-bindings/golang/gpt4all.go
@@ -7,7 +7,7 @@ package gpt4all
// #cgo LDFLAGS: -lgpt4all -lm -lstdc++ -ldl
// void* load_model(const char *fname, int n_threads);
// void model_prompt( const char *prompt, void *m, char* result, int repeat_last_n, float repeat_penalty, int n_ctx, int tokens, int top_k,
// float top_p, float temp, int n_batch,float ctx_erase);
// float top_p, float min_p, float temp, int n_batch,float ctx_erase);
// void free_model(void *state_ptr);
// extern unsigned char getTokenCallback(void *, char *);
// void llmodel_set_implementation_search_path(const char *path);
@@ -58,7 +58,7 @@ func (l *Model) Predict(text string, opts ...PredictOption) (string, error) {
out := make([]byte, po.Tokens)

C.model_prompt(input, l.state, (*C.char)(unsafe.Pointer(&out[0])), C.int(po.RepeatLastN), C.float(po.RepeatPenalty), C.int(po.ContextSize),
C.int(po.Tokens), C.int(po.TopK), C.float(po.TopP), C.float(po.Temperature), C.int(po.Batch), C.float(po.ContextErase))
C.int(po.Tokens), C.int(po.TopK), C.float(po.TopP), C.float(po.MinP), C.float(po.Temperature), C.int(po.Batch), C.float(po.ContextErase))

res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
res = strings.TrimPrefix(res, " ")
10 changes: 9 additions & 1 deletion gpt4all-bindings/golang/options.go
@@ -2,7 +2,7 @@ package gpt4all

type PredictOptions struct {
ContextSize, RepeatLastN, Tokens, TopK, Batch int
TopP, Temperature, ContextErase, RepeatPenalty float64
TopP, MinP, Temperature, ContextErase, RepeatPenalty float64
}

type PredictOption func(p *PredictOptions)
@@ -11,6 +11,7 @@ var DefaultOptions PredictOptions = PredictOptions{
Tokens: 200,
TopK: 10,
TopP: 0.90,
MinP: 0.0,
Temperature: 0.96,
Batch: 1,
ContextErase: 0.55,
@@ -50,6 +51,13 @@ func SetTopP(topp float64) PredictOption {
}
}

// SetMinP sets the value for min p sampling
func SetMinP(minp float64) PredictOption {
return func(p *PredictOptions) {
p.MinP = minp
}
}

// SetRepeatPenalty sets the repeat penalty.
func SetRepeatPenalty(ce float64) PredictOption {
return func(p *PredictOptions) {
Original file line number Diff line number Diff line change
@@ -32,6 +32,7 @@ private GenerationConfig() {
n_predict.set(128);
top_k.set(40);
top_p.set(0.95);
min_p.set(0.0);
temp.set(0.28);
n_batch.set(8);
repeat_penalty.set(1.1);
@@ -71,6 +72,11 @@ public Builder withTopP(float top_p) {
return this;
}

public Builder withMinP(float min_p) {
configToBuild.min_p.set(min_p);
return this;
}

public Builder withTemp(float temp) {
configToBuild.temp.set(temp);
return this;
Original file line number Diff line number Diff line change
@@ -48,6 +48,7 @@ class LLModelPromptContext extends Struct {
public final int32_t n_predict = new int32_t();
public final int32_t top_k = new int32_t();
public final Float top_p = new Float();
public final Float min_p = new Float();
public final Float temp = new Float();
public final int32_t n_batch = new int32_t();
public final Float repeat_penalty = new Float();