Merge remote-tracking branch 'upstream/concedo'

YellowRoseCx · Oct 1, 2023 · 1c4e4a6 · 1c4e4a6
2 parents 354fa72 + bc841ec
commit 1c4e4a6
Show file tree

Hide file tree

Showing 80 changed files with 11,937 additions and 4,300 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,31 +12,44 @@
 .vs/
 .vscode/
 
+lcov-report/
+gcovr-report/
+
 build*/
 out/
 tmp/
 
 models/*
 models-mnt
 
-/main
-/quantize
-/quantize-stats
-/result
-/perplexity
-/embedding
-/train-text-from-scratch
-/convert-llama2c-to-ggml
-/simple
-/benchmark-matmult
-/vdot
-/server
 /Pipfile
+/baby-llama
+/beam-search
+/benchmark-matmult
+/convert-llama2c-to-ggml
 /embd-input-test
+/embedding
 /gguf
 /gguf-llama-simple
 /libllama.so
 /llama-bench
+/main
+/metal
+/perplexity
+/q8dot
+/quantize
+/quantize-stats
+/result
+/save-load-state
+/server
+/simple
+/batched
+/export-lora
+/finetune
+/speculative
+/parallel
+/train-text-from-scratch
+/vdot
 build-info.h
 arm_neon.h
 compile_commands.json

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -251,37 +251,38 @@ endif()
 
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
-        set(c_flags
-            -Wall
-            -Wextra
-            -Wpedantic
-            -Wcast-qual
-            -Wdouble-promotion
-            -Wshadow
-            -Wstrict-prototypes
-            -Wpointer-arith
-            -Wmissing-prototypes
-            -Werror=implicit-int
-            -Wno-unused-function
-        )
-        set(cxx_flags
-            -Wall
-            -Wextra
-            -Wpedantic
-            -Wcast-qual
-            -Wmissing-declarations
-            -Wno-unused-function
-            -Wno-multichar
-        )
-        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-            # g++ only
-            set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
+        set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
+            -Werror=implicit-function-declaration)
+        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+
+        if (CMAKE_C_COMPILER_ID MATCHES "Clang")
+            set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
+            set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi)
+
+            if (
+                (CMAKE_C_COMPILER_ID STREQUAL "Clang"      AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
+                (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0)
+            )
+                set(c_flags ${c_flags} -Wdouble-promotion)
+            endif()
+        elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+            set(c_flags ${c_flags} -Wdouble-promotion)
+            set(cxx_flags ${cxx_flags} -Wno-array-bounds)
+
+            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
+                set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+            endif()
+            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
+                set(cxx_flags ${cxx_flags} -Wextra-semi)
+            endif()
         endif()
     else()
         # todo : msvc
     endif()
 
     add_compile_options(
+            ${warning_flags}
             "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
             "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
     )

diff --git a/Makefile b/Makefile
@@ -39,10 +39,15 @@ endif
 #
 
 # keep standard at C11 and C++11
-CFLAGS   = -I.              -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11   -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE
-CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS -DLOG_DISABLE_LOGS -D_GNU_SOURCE
+CFLAGS   = -I.            -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11   -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
+CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
 LDFLAGS  =
 
+ifndef LLAMA_NO_K_QUANTS
+CFLAGS   += -DGGML_USE_K_QUANTS
+CXXFLAGS += -DGGML_USE_K_QUANTS
+endif
+
 # these are used on windows, to build some libraries with extra old device compatibility
 SIMPLECFLAGS =
 FULLCFLAGS =
@@ -285,19 +290,17 @@ ifeq ($(OS),Windows_NT)
 	endif
 else
 	DEFAULT_BUILD = $(CXX) $(CXXFLAGS)  $^ -shared -o [email protected] $(LDFLAGS)
-	FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
+
 	ifdef LLAMA_OPENBLAS
 	OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
-	NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
 	endif
 	ifdef LLAMA_CLBLAST
 		ifeq ($(UNAME_S),Darwin)
-				CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
+			CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
 		else
-				CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
+			CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -lOpenCL $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
 		endif
 	endif
-
 	ifdef LLAMA_CUBLAS
 		CUBLAS_BUILD = $(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) $^ -shared -o [email protected] $(CUBLASLD_FLAGS) $(LDFLAGS)
 	endif
@@ -351,12 +354,20 @@ ggml_cublas.o: ggml.c ggml.h ggml-cuda.h k_quants.h
 	$(CC)  $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
 
 #quants K
+KQ1 =
+KQ2 =
+KQ3 =
+ifndef LLAMA_NO_K_QUANTS
+KQ1 = k_quants.o
+KQ2 = k_quants_noavx2.o
+KQ3 = k_quants_failsafe.o
 k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
 	$(CC)  $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
 k_quants_noavx2.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
 	$(CC)  $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
 k_quants_failsafe.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
 	$(CC)  $(CFLAGS) $(NONECFLAGS) -c $< -o $@
+endif # LLAMA_NO_K_QUANTS
 
 #there's no intrinsics or special gpu ops used here, so we can have a universal object
 ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
@@ -416,7 +427,7 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
 clean:
 	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so
 
-main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
@@ -425,31 +436,69 @@ main: examples/main/main.cpp build-info.h ggml.o k_quants.o ggml-alloc.o llama.o
 gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+
 #generated libraries
-koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
+koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
 	$(DEFAULT_BUILD)
-koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
+
+ifdef OPENBLAS_BUILD
+koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
 	$(OPENBLAS_BUILD)
-koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_failsafe.o ggml-alloc.o grammar-parser.o $(OBJS)
+else
+koboldcpp_openblas:
+	$(DONOTHING)
+endif
+
+ifdef FAILSAFE_BUILD
+koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o grammar-parser.o $(OBJS)
 	$(FAILSAFE_BUILD)
-koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o k_quants_noavx2.o ggml-alloc.o grammar-parser.o $(OBJS)
+else
+koboldcpp_failsafe:
+	$(DONOTHING)
+endif
+
+ifdef NOAVX2_BUILD
+koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o grammar-parser.o $(OBJS)
 	$(NOAVX2_BUILD)
-koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o k_quants.o ggml-alloc.o grammar-parser.o $(OBJS)
+else
+koboldcpp_noavx2:
+	$(DONOTHING)
+endif
+
+ifdef CLBLAST_BUILD
+koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
 	$(CLBLAST_BUILD)
-koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
+else
+koboldcpp_clblast:
+	$(DONOTHING)
+endif
+
+ifdef CUBLAS_BUILD
+koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
 	$(CUBLAS_BUILD)
-koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS)
+else
+koboldcpp_cublas:
+	$(DONOTHING)
+endif
+
+ifdef HIPBLAS_BUILD
+koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS)
 	$(HIPBLAS_BUILD)
+else
+koboldcpp_hipblas:
+	$(DONOTHING)
+endif
 
-quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-alloc.o
+# tools
+quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gptj: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gpt2: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_neox: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_mpt: ggml.o llama.o k_quants.o ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 perplexity: examples/perplexity/perplexity.cpp                build-info.h ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o k_quants.o ggml-alloc.o $(CUBLAS_OBJS) $(HIP_OBJS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(HIPLDFLAGS)

diff --git a/Package.swift b/Package.swift
@@ -10,7 +10,7 @@ let platforms: [SupportedPlatform]? = [
     .tvOS(.v14)
 ]
 let exclude: [String] = []
-let additionalSources: [String] = ["ggml-metal.m"]
+let additionalSources: [String] = ["ggml-metal.m", "ggml-metal.metal"]
 let additionalSettings: [CSetting] = [
     .unsafeFlags(["-fno-objc-arc"]),
     .define("GGML_SWIFT"),
@@ -44,7 +44,9 @@ let package = Package(
             cSettings: [
                 .unsafeFlags(["-Wno-shorten-64-to-32"]),
                 .define("GGML_USE_K_QUANTS"),
-                .define("GGML_USE_ACCELERATE")
+                .define("GGML_USE_ACCELERATE"),
+                .define("ACCELERATE_NEW_LAPACK"),
+                .define("ACCELERATE_LAPACK_ILP64")
             ] + additionalSettings,
             linkerSettings: [
                 .linkedFramework("Accelerate")

diff --git a/README.md b/README.md
@@ -77,6 +77,9 @@ For more information, be sure to run the program with the `--help` flag.
 - If you're on Windows with an Nvidia GPU you can get CUDA support out of the box using the `--usecublas` flag, make sure you select the correct .exe with CUDA support.
 - You can attempt a CuBLAS build with `LLAMA_CUBLAS=1` or using the provided CMake file (best for visual studio users). If you use the CMake file to build, copy the `koboldcpp_cublas.dll` generated into the same directory as the `koboldcpp.py` file. If you are bundling executables, you may need to include CUDA dynamic libraries (such as `cublasLt64_11.dll` and `cublas64_11.dll`) in order for the executable to work correctly on a different PC.
 
+## AMD
+- Please check out https://github.com/YellowRoseCx/koboldcpp-rocm
+
 ## Questions and Help
 - **First, please check out [The KoboldCpp FAQ and Knowledgebase](https://github.com/LostRuins/koboldcpp/wiki) which may already have answers to your questions! Also please search through past issues and discussions.**
 - If you cannot find an answer, open an issue on this github, or find us on the [KoboldAI Discord](https://koboldai.org/discord).
@@ -94,7 +97,7 @@ For more information, be sure to run the program with the `--help` flag.
 - The other files are also under the AGPL v3.0 License unless otherwise stated
 
 ## Notes
-- Generation delay scales linearly with original prompt length. If OpenBLAS is enabled then prompt ingestion becomes about 2-3x faster. This is automatic on windows, but will require linking on OSX and Linux. CLBlast speeds this up even further, and `--gpulayers` + `--useclblast` more so.
+- Generation delay scales linearly with original prompt length. If OpenBLAS is enabled then prompt ingestion becomes about 2-3x faster. This is automatic on windows, but will require linking on OSX and Linux. CLBlast speeds this up even further, and `--gpulayers` + `--useclblast` or `--usecublas` more so.
 - I have heard of someone claiming a false AV positive report. The exe is a simple pyinstaller bundle that includes the necessary python scripts and dlls to run. If this still concerns you, you might wish to rebuild everything from source code using the makefile, and you can rebuild the exe yourself with pyinstaller by using `make_pyinstaller.bat`
 - Supported GGML models (Includes backward compatibility for older versions/legacy GGML models, though some newer features might be unavailable):
   - LLAMA and LLAMA2 (LLaMA / Alpaca / GPT4All / Vicuna / Koala / Pygmalion 7B / Metharme 7B / WizardLM and many more)
@@ -103,4 +106,5 @@ For more information, be sure to run the program with the `--help` flag.
   - RWKV
   - GPT-NeoX / Pythia / StableLM / Dolly / RedPajama
   - MPT models
+  - Falcon (GGUF only)
 
diff --git a/Remote-Link.cmd b/Remote-Link.cmd
@@ -1,2 +1,18 @@
-curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -o cloudflared.exe
-cloudflared.exe tunnel --url localhost:5001
+: # This script will help setup a cloudflared tunnel for accessing KoboldCpp over the internet
+: # It should work out of the box on both linux and windows
+: # ======
+: # WINDOWS PORTION
+:<<BATCH
+    @echo off
+    echo Starting Cloudflare Tunnel for Windows
+    curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -o cloudflared.exe
+    cloudflared.exe tunnel --url localhost:5001
+    GOTO ENDING
+BATCH
+: # LINUX PORTION
+echo 'Starting Cloudflare Tunnel for Linux'
+curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -o 'cloudflared-linux-amd64' # 
+chmod +x 'cloudflared-linux-amd64' # 
+./cloudflared-linux-amd64 tunnel --url http://localhost:5001 # 
+exit # 
+:ENDING
diff --git a/class.py b/class.py
@@ -188,6 +188,18 @@ def get_requested_parameters(self, model_name, model_path, menu_path, parameters
                                     "extra_classes": "",
                                     'children': [{'text': 'False', 'value': False}, {'text': 'True', 'value': True}],
                                     })
+        requested_parameters.append({
+                                    "uitype": "text",
+                                    "unit": "text",
+                                    "label": "GPU ID",
+                                    "id": "kcpp_tensor_split_str",
+                                    "default": "1",
+                                    "check": {"value": "", 'check': "!="},
+                                    "tooltip": "Which GPU's do we use? For example:1 2",
+                                    "menu_path": "",
+                                    "refresh_model_inputs": False,
+                                    "extra_classes": ""
+                                    })
         requested_parameters.append({
                                     "uitype": "dropdown",
                                     "unit": "int",
@@ -202,18 +214,6 @@ def get_requested_parameters(self, model_name, model_path, menu_path, parameters
                                     "extra_classes": "",
                                     'children': [{'text': 'False', 'value': 0}, {'text': 'True', 'value': 1}],
                                     })
-        requested_parameters.append({
-                                    "uitype": "text",
-                                    "unit": "text",
-                                    "label": "Tensor Split",
-                                    "id": "kcpp_tensor_split_str",
-                                    "default": self.kcpp_tensor_split_str,
-                                    "check": {"value": "", 'check': "!="},
-                                    "tooltip": "Tensor Split, values are space separated",
-                                    "menu_path": "",
-                                    "refresh_model_inputs": False,
-                                    "extra_classes": ""
-                                    })
         return requested_parameters
 
     def set_input_parameters(self, parameters):
@@ -232,6 +232,7 @@ def set_input_parameters(self, parameters):
             self.kcpp_tensor_split = []
             for s in splits:
                 self.kcpp_tensor_split.append(int(s))
+                print(self.kcpp_tensor_split)
 
         accel = parameters["kcpp_accelerator"]
         if accel==0:
@@ -271,7 +272,8 @@ def _load(self, save_model: bool, initial_load: bool) -> None:
             blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase], stream=False, smartcontext=self.kcpp_smartcontext,
             unbantokens=False, bantokens=None, usemirostat=None, forceversion=0, nommap=self.kcpp_nommap,
             usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas,
-            useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None, onready='', multiuser=False)
+            useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None,
+            onready='', multiuser=False, foreground=False)
 
             koboldcpp.main(kcppargs,False) #initialize library without enabling Lite http server
             kcpp_backend_loaded = True