From e7a962c70a6c86e77f1f2b64b5bebcfb9f74fa81 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 2 May 2024 10:57:54 +0800
Subject: [PATCH 1/9] update readme

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 427cd9822dacc..363f94bcbce63 100644
--- a/README.md
+++ b/README.md
@@ -32,11 +32,11 @@ For more information, be sure to run the program with the `--help` flag, or [che
 ## OSX and Linux
 
 ### Linux Usage (Precompiled Binary, Recommended)
-On Linux, we provide a `koboldcpp-linux-x64` PyInstaller prebuilt binary on the **[releases](https://github.com/LostRuins/koboldcpp/releases/latest)** page for modern systems. Simply download and run the binary.
+On Linux, we provide a `koboldcpp-linux-x64-cuda1150` PyInstaller prebuilt binary on the **[releases](https://github.com/LostRuins/koboldcpp/releases/latest)** page for modern systems. Simply download and run the binary.
 
 Alternatively, you can also install koboldcpp to the current directory by running the following terminal command:
 ```
-curl -fLo koboldcpp https://github.com/LostRuins/koboldcpp/releases/latest/download/koboldcpp-linux-x64 && chmod +x koboldcpp
+curl -fLo koboldcpp https://github.com/LostRuins/koboldcpp/releases/latest/download/koboldcpp-linux-x64-cuda1150 && chmod +x koboldcpp
 ```
 After running this command you can launch Koboldcpp from the current directory using `./koboldcpp` in the terminal (for CLI usage, run with `--help`).

From fb7e72352ec26c78e4956ec46f9db729d7788b25 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 2 May 2024 14:17:48 +0800
Subject: [PATCH 2/9] benchmark includes ver

---
 koboldcpp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index a5ca38809a8f2..043354d546e51 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -3197,7 +3197,7 @@ def onready_subprocess():
         s_pp = float(benchmaxctx-benchlen)/t_pp
         s_gen = float(benchlen)/t_gen
         datetimestamp = datetime.now(timezone.utc)
-        print(f"\nBenchmark Completed - Results:\n======")
+        print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
         print(f"Timestamp: {datetimestamp}")
         print(f"Backend: {libname}")
         print(f"Layers: {args.gpulayers}")

From 0d8c4a9b73eda0c20a8cf191431288f6ede78d25 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 2 May 2024 14:21:44 +0800
Subject: [PATCH 3/9] remove quick lowvram option

---
 koboldcpp.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index 043354d546e51..5f696008fdd1f 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -1987,7 +1987,6 @@ def changerunmode(a,b,c):
         if index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
-            quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
             mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
             quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
             splitmode_box.grid(row=5, column=1, padx=8, pady=1, stick="nw")
 
@@ -1995,7 +1994,6 @@ def changerunmode(a,b,c):
             tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
         else:
             lowvram_box.grid_forget()
-            quick_lowvram_box.grid_forget()
             mmq_box.grid_forget()
             quick_mmq_box.grid_forget()
             tensor_split_label.grid_forget()
@@ -2033,7 +2031,6 @@ def changerunmode(a,b,c):
     quick_gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
     quick_gpuname_label.configure(text_color="#ffff00")
     quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 6, 50,"How many layers to offload onto the GPU.\nVRAM intensive, usage increases with model and context size.\nRequires some trial and error to find the best fit value.")
-    quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM (No KV offload)", lowvram_var, 4,0,tooltiptxt="Avoid offloading KV Cache or scratch buffers to VRAM.\nAllows more layers to fit, but may result in a speed loss.")
     quick_mmq_box = makecheckbox(quick_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1,tooltiptxt="Enable MMQ mode instead of CuBLAS for prompt processing. Read the wiki. Speed may vary.")
 
 

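The GUI shows or hides backend-specific widgets from `changerunmode` by calling `grid()` and `grid_forget()`; this patch simply drops the duplicated quick-tab "Low VRAM" checkbox from that toggle list. A minimal sketch of the show/hide pattern, using plain tkinter rather than the customtkinter wrappers in koboldcpp.py, with illustrative widget names:

```python
# Sketch of the grid()/grid_forget() toggle pattern used by changerunmode.
# Plain tkinter stand-in for the customtkinter GUI; names are illustrative.
import tkinter as tk

root = tk.Tk()
backend_var = tk.StringVar(value="Use CuBLAS")

lowvram_box = tk.Checkbutton(root, text="Low VRAM (No KV offload)")
mmq_box = tk.Checkbutton(root, text="Use QuantMatMul (mmq)")

def changerunmode(*_):
    # GPU-only options are only placed on the grid for CUDA/ROCm backends;
    # grid_forget() hides them again without destroying the widgets.
    if backend_var.get() in ("Use CuBLAS", "Use hipBLAS (ROCm)"):
        lowvram_box.grid(row=4, column=0, padx=8, pady=1, sticky="nw")
        mmq_box.grid(row=4, column=1, padx=8, pady=1, sticky="nw")
    else:
        lowvram_box.grid_forget()
        mmq_box.grid_forget()

backend_var.trace_add("write", changerunmode)
tk.OptionMenu(root, backend_var, "Use CuBLAS", "Use hipBLAS (ROCm)", "Use OpenBLAS").grid(row=0, column=0)
changerunmode()
root.mainloop()
```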
From 4c5d307f597291355f129c0e085a504f2665344e Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 2 May 2024 23:11:24 +0800
Subject: [PATCH 4/9] fixed benchmark interrupt (+2 squashed commit)

Squashed commit:

[6e334c8b] require enter key to be pressed

[d50d49b6] fixed bench script
---
 koboldcpp.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index 5f696008fdd1f..fa419100eac1f 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -1502,9 +1502,6 @@ def show_new_gui():
     from tkinter.filedialog import askopenfilename
     from tkinter.filedialog import asksaveasfile
 
-    global using_gui_launcher
-    using_gui_launcher = True
-
     # if args received, launch
     if len(sys.argv) != 1:
         import tkinter as tk
@@ -1568,6 +1565,8 @@ def on_resize(event):
             ctk.set_widget_scaling(smallratio)
 
     root.bind("<Configure>", on_resize)
+    global using_gui_launcher
+    using_gui_launcher = True
 
     # trigger empty tooltip then remove it
     def show_tooltip(event, tooltip_text=None):
@@ -3217,7 +3216,11 @@ def onready_subprocess():
                 file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{resultok},{result}")
             except Exception as e:
                 print(f"Error writing benchmark to file: {e}")
-
+        global using_gui_launcher
+        if using_gui_launcher and not save_to_file:
+            print("===")
+            print("Press ENTER key to exit.", flush=True)
+            input()
 
     if start_server:
         if args.remotetunnel:
@@ -3229,11 +3232,6 @@ def onready_subprocess():
     else:
         # Flush stdout for previous win32 issue so the client can see output.
         print(f"Server was not started, main function complete. Idling.", flush=True)
-        global using_gui_launcher
-        if using_gui_launcher:
-            print("===")
-            print("Press a key to exit", flush=True)
-            input()
 
 def run_in_queue(launch_args, input_queue, output_queue):
     main(launch_args, start_server=False)

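The benchmark path now pauses only when the program was started from the GUI launcher and the results were not already saved to a file, so a double-clicked console window does not vanish before the numbers can be read. A simplified sketch of that guard, with the surrounding benchmark code reduced to a stub and the flags set by hand:

```python
# Sketch of the "pause before exit" guard the patch moves into the benchmark
# path. Names mirror the diff, but the surrounding program flow is hypothetical.
using_gui_launcher = True   # set by the GUI code path in the real program
save_to_file = False        # True when the benchmark was given an output csv

def finish_benchmark():
    print("Benchmark Completed - Results:\n======")
    if using_gui_launcher and not save_to_file:
        print("===")
        print("Press ENTER key to exit.", flush=True)
        input()  # keep the console window open until the user confirms

finish_benchmark()
```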
From a34a09d19698c85ab9ef95ada54d83c55519c3fa Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Fri, 3 May 2024 15:57:13 +0800
Subject: [PATCH 5/9] replace destroy with quit for tk

---
 koboldcpp.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index fa419100eac1f..c64fee035c51a 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -1508,7 +1508,8 @@ def show_new_gui():
         root = tk.Tk() #we dont want the useless window to be visible, but we want it in taskbar
         root.attributes("-alpha", 0)
         args.model_param = askopenfilename(title="Select ggml model .bin or .gguf file or .kcpps config")
-        root.destroy()
+        root.withdraw()
+        root.quit()
         if args.model_param and args.model_param!="" and args.model_param.lower().endswith('.kcpps'):
             loadconfigfile(args.model_param)
         if not args.model_param and not args.sdconfig:
@@ -2190,7 +2191,8 @@ def guilaunch():
             model_var.set(tmp)
         nonlocal nextstate
         nextstate = 1
-        root.destroy()
+        root.withdraw()
+        root.quit()
         pass
 
     def export_vars():
@@ -2496,7 +2498,8 @@ def show_gui_msgbox(title,message):
         root = tk.Tk()
         root.attributes("-alpha", 0)
         messagebox.showerror(title=title, message=message)
-        root.destroy()
+        root.withdraw()
+        root.quit()
     except Exception as ex2:
         pass
 
@@ -3181,8 +3184,8 @@ def onready_subprocess():
     else:
         print(f"\nRunning benchmark (Not Saved)...")
 
-    benchprompt = "11111111"
-    for i in range(0,10): #generate massive prompt
+    benchprompt = "1111111111111111"
+    for i in range(0,12): #generate massive prompt
         benchprompt += benchprompt
     genout = generate(benchprompt,memory="",images=[],max_length=benchlen,max_context_length=benchmaxctx,temperature=0.1,top_k=1,rep_pen=1,use_default_badwordsids=True)
     result = genout['text']

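The patch above replaces `root.destroy()` with `root.withdraw()` followed by `root.quit()` on the temporary Tk roots that exist only to host a file dialog or a message box. A minimal sketch of that pattern, assuming a plain tkinter environment; presumably the intent is to hide the helper window and stop its event loop without tearing the interpreter down mid-flow:

```python
# Minimal sketch of the withdraw()+quit() pattern the patch switches to,
# simplified from the diff; not the full koboldcpp launch flow.
import tkinter as tk
from tkinter.filedialog import askopenfilename

root = tk.Tk()
root.attributes("-alpha", 0)   # keep the useless helper window invisible
path = askopenfilename(title="Select ggml model .bin or .gguf file")
root.withdraw()                # hide the helper window
root.quit()                    # stop its event loop instead of destroy()
print(f"Selected: {path}")
```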
From b6bfab128f3b39c117f3e65aedd06a7475d5566d Mon Sep 17 00:00:00 2001
From: henk717
Date: Fri, 3 May 2024 11:12:57 +0200
Subject: [PATCH 6/9] CUDA 12 CI (#815)

* Allow KCPP_CUDA to specify CUDA version

* CUDA 12 CI Linux

* CUDA 12 CI

* Fix KCPP_CUDA indent

* KCPP_CUDA ENV Fix

StackOverflow is bad for advice sometimes....

* Lowcase cuda on output filename

* Strip . from filename output
---
 .../kcpp-build-release-linux-cuda12.yaml  | 33 ++++++++++++++++++
 .../kcpp-build-release-win-cuda12.yaml    | 34 +++++++++++++++++++
 koboldcpp.sh                              | 21 +++++++++---
 3 files changed, 83 insertions(+), 5 deletions(-)
 create mode 100644 .github/workflows/kcpp-build-release-linux-cuda12.yaml
 create mode 100644 .github/workflows/kcpp-build-release-win-cuda12.yaml

diff --git a/.github/workflows/kcpp-build-release-linux-cuda12.yaml b/.github/workflows/kcpp-build-release-linux-cuda12.yaml
new file mode 100644
index 0000000000000..0cbeaf9a2b942
--- /dev/null
+++ b/.github/workflows/kcpp-build-release-linux-cuda12.yaml
@@ -0,0 +1,33 @@
+name: Koboldcpp Builder Linux CUDA12
+
+on: workflow_dispatch
+env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+  KCPP_CUDA: 12.1.0
+
+jobs:
+  linux:
+    runs-on: ubuntu-20.04
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          ref: concedo_experimental
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install git curl bzip2
+
+      - name: Build
+        id: make_build
+        run: |
+          ./koboldcpp.sh dist
+
+      - name: Save artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: kcpp_linux_binary
+          path: dist/
diff --git a/.github/workflows/kcpp-build-release-win-cuda12.yaml b/.github/workflows/kcpp-build-release-win-cuda12.yaml
new file mode 100644
index 0000000000000..ed9065d939728
--- /dev/null
+++ b/.github/workflows/kcpp-build-release-win-cuda12.yaml
@@ -0,0 +1,34 @@
+name: Koboldcpp Builder Windows CUDA12
+
+on: workflow_dispatch
+env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+jobs:
+  windows:
+    runs-on: windows-2019
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          ref: concedo_experimental
+
+      - uses: Jimver/cuda-toolkit@v0.2.11
+        id: cuda-toolkit
+        with:
+          cuda: '12.1.0'
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_CUBLAS=ON -DCMAKE_SYSTEM_VERSION="10.0.19041.0"
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+
+      - name: Save artifact
+        uses: actions/upload-artifact@v3
+        with:
+          name: kcpp_windows_cuda_binary
+          path: build/bin/Release/
diff --git a/koboldcpp.sh b/koboldcpp.sh
index b2923ec4ff7bd..587270d2add22 100755
--- a/koboldcpp.sh
+++ b/koboldcpp.sh
@@ -1,13 +1,24 @@
 #!/bin/bash
+
 if [ ! -f "bin/micromamba" ]; then
   curl -Ls https://anaconda.org/conda-forge/micromamba/1.5.3/download/linux-64/micromamba-1.5.3-0.tar.bz2 | tar -xvj bin/micromamba
 fi
 if [[ ! -f "conda/envs/linux/bin/python" || $1 == "rebuild" ]]; then
-  bin/micromamba create --no-shortcuts -r conda -n linux -f environment.yaml -y
-  bin/micromamba create --no-shortcuts -r conda -n linux -f environment.yaml -y
+  cp environment.yaml environment.tmp.yaml
+  if [ -n "$KCPP_CUDA" ]; then
+    sed -i -e "s/nvidia\/label\/cuda-11.5.0/nvidia\/label\/cuda-$KCPP_CUDA/g" environment.tmp.yaml
+  else
+    KCPP_CUDA=11.5.0
+  fi
+  bin/micromamba create --no-shortcuts -r conda -n linux -f environment.tmp.yaml -y
+  bin/micromamba create --no-shortcuts -r conda -n linux -f environment.tmp.yaml -y
   bin/micromamba run -r conda -n linux make clean
+  echo $KCPP_CUDA > conda/envs/linux/cudaver
+  echo rm environment.tmp.yaml
 fi
+KCPP_CUDA=$(

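The `KCPP_CUDA` value chosen here also ends up in the release filename (the README in PATCH 1 points at `koboldcpp-linux-x64-cuda1150`), per the "Lowcase cuda" and "Strip ." notes in the commit message. The exact shell handling is truncated in this patch, so the following is only an inferred Python illustration of that naming scheme, not the actual build-script logic:

```python
# Inferred illustration of the artifact-naming convention: the CUDA version
# selected via KCPP_CUDA ("11.5.0", "12.1.0", ...) is lowercased and has its
# dots stripped to form the binary suffix, e.g. koboldcpp-linux-x64-cuda1150.
import os

def linux_binary_name(kcpp_cuda):
    version = (kcpp_cuda or "11.5.0").lower()   # 11.5.0 is the fallback in koboldcpp.sh
    suffix = "cuda" + version.replace(".", "")  # "12.1.0" -> "cuda1210"
    return "koboldcpp-linux-x64-" + suffix

print(linux_binary_name(os.environ.get("KCPP_CUDA")))  # e.g. koboldcpp-linux-x64-cuda1150
```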
Date: Fri, 3 May 2024 18:13:39 +0800
Subject: [PATCH 7/9] add kobble tiny to readme

---
 README.md   | 4 ++--
 colab.ipynb | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 363f94bcbce63..cc85b3ce98dd8 100644
--- a/README.md
+++ b/README.md
@@ -90,8 +90,8 @@ There are some community made AUR packages (Maintained by @AlpinDale) available:
 - Clone the repo `git clone https://github.com/LostRuins/koboldcpp.git`
 - Navigate to the koboldcpp folder `cd koboldcpp`
 - Build the project `make`
-- Grab a small GGUF model, such as `wget https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf`
-- Start the python server `python koboldcpp.py --model phi-2.Q2_K.gguf`
+- Grab a small GGUF model, such as `wget https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf`
+- Start the python server `python koboldcpp.py --model KobbleTiny-Q4_K.gguf`
 - Connect to `http://localhost:5001` on your mobile browser
 - If you encounter any errors, make sure your packages are up-to-date with `pkg up`
 - GPU acceleration for Termux may be possible but I have not explored it. If you find a good cross-device solution, do share or PR it.
diff --git a/colab.ipynb b/colab.ipynb
index ca791c6859e34..606069712b94b 100644
--- a/colab.ipynb
+++ b/colab.ipynb
@@ -48,7 +48,7 @@
    "source": [
     "#@title v-- Enter your model below and then click this to start Koboldcpp\r\n",
     "\r\n",
-    "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/Sao10K/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2-Test-14.q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/afrideva/phi-2-uncensored-GGUF/resolve/main/phi-2-uncensored.q3_k_m.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\"]{allow-input: true}\r\n",
+    "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/Sao10K/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2-Test-14.q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\"]{allow-input: true}\r\n",
     "Layers = 99 #@param [99]{allow-input: true}\r\n",
     "ContextSize = 4096 #@param [4096] {allow-input: true}\r\n",
     "ForceRebuild = False #@param {type:\"boolean\"}\r\n",

From 89db8afded10625a75dbabb6e04656832158f159 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 4 May 2024 10:07:54 +0800
Subject: [PATCH 8/9] revert moondream to try and fix llava

---
 examples/llava/clip.cpp | 71 +++++++----------------------------------
 1 file changed, 11 insertions(+), 60 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e3c9bcd4364aa..f568f470c8f5c 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -104,7 +104,6 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD "%s.position_embd.weight"
 #define TN_CLASS_EMBD "v.class_embd"
 #define TN_PATCH_EMBD "v.patch_embd.weight"
-#define TN_PATCH_BIAS "v.patch_embd.bias"
 #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@@ -426,7 +425,6 @@ struct clip_vision_model {
     // embeddings
     struct ggml_tensor * class_embedding;
     struct ggml_tensor * patch_embeddings;
-    struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;
 
     struct ggml_tensor * pre_ln_w;
@@ -503,11 +501,6 @@ struct clip_ctx {
     bool use_gelu = false;
     int32_t ftype = 1;
 
-    bool has_class_embedding = true;
-    bool has_pre_norm = true;
-    bool has_post_norm = false;
-    bool has_patch_bias = false;
-
     struct gguf_context * ctx_gguf;
     struct ggml_context * ctx_data;
 
@@ -533,7 +526,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
     const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_positions = num_patches + 1;
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
@@ -564,23 +557,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
     inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
 
-    if (ctx->has_patch_bias) {
-        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
-        inp = ggml_add(ctx0, inp, model.patch_bias);
-    }
-
     // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = inp;
-    if (ctx->has_class_embedding) {
-        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-        embeddings = ggml_acc(ctx0, embeddings, inp,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-    }
+    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
 
     ggml_set_name(embeddings, "embeddings");
     ggml_set_input(embeddings);
+    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+        embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+
+    embeddings = ggml_acc(ctx0, embeddings, inp,
+        embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
 
     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
@@ -590,7 +576,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
 
     // pre-layernorm
-    if (ctx->has_pre_norm) {
+    {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "pre_ln");
 
@@ -678,14 +664,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = cur;
     }
 
-    // post-layernorm
-    if (ctx->has_post_norm) {
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        ggml_set_name(embeddings, "post_ln");
-
-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
-    }
-
     // llava projector
     {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1170,39 +1148,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
         }
 
-        try {
-            vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
-            new_clip->has_class_embedding = true;
-        } catch (const std::exception& e) {
-            new_clip->has_class_embedding = false;
-        }
-
-        try {
-            vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-            vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-            new_clip->has_pre_norm = true;
-        } catch (std::exception & e) {
-            new_clip->has_pre_norm = false;
-        }
-
-        try {
-            vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
-            vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
-            new_clip->has_post_norm = true;
-        } catch (std::exception & e) {
-            new_clip->has_post_norm = false;
-        }
-
-        try {
-            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
-            new_clip->has_patch_bias = true;
-        } catch (std::exception & e) {
-            new_clip->has_patch_bias = false;
-        }
-
         try {
             vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+            vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
             vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+            vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+            vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
         } catch(const std::exception& e) {
             LOG_TEE("%s: failed to load vision model tensors\n", __func__);
         }

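The revert swaps per-tensor `try`/`catch` probing (which set `has_class_embedding`, `has_patch_bias`, and friends) back to a single all-or-nothing `try` block. The real code is C++ in `examples/llava/clip.cpp`; the sketch below only contrasts the two loading strategies, using a hypothetical in-memory tensor table in place of `get_tensor()` and only tensor names visible in the hunk above:

```python
# Hypothetical stand-in for the GGUF tensor table of a plain LLaVA CLIP model
# (no patch bias tensor present, as TN_PATCH_BIAS was removed above).
tensors = {
    "v.patch_embd.weight": "...",
    "v.class_embd": "...",
    "v.position_embd.weight": "...",
}

def load_strict(t):
    # Reverted behaviour: the listed tensors are simply required; any miss
    # lands in the single catch block ("failed to load vision model tensors").
    return {name: t[name] for name in
            ("v.patch_embd.weight", "v.class_embd", "v.position_embd.weight")}

def load_optional(t):
    # Moondream-era behaviour: probe optional tensors one by one and record
    # has_* flags instead of failing outright.
    model, flags = {}, {}
    for flag, name in [("has_class_embedding", "v.class_embd"),
                       ("has_patch_bias", "v.patch_embd.bias")]:
        try:
            model[name] = t[name]
            flags[flag] = True
        except KeyError:
            flags[flag] = False
    return model, flags

print(sorted(load_strict(tensors)))
print(load_optional(tensors)[1])  # {'has_class_embedding': True, 'has_patch_bias': False}
```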
From a3718c6354dd0459154fa5297f1f29587845bc76 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 4 May 2024 10:38:20 +0800
Subject: [PATCH 9/9] 1.64.1 to fix llava issues

---
 koboldcpp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index c64fee035c51a..edf732bc3eb78 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -636,7 +636,7 @@ def string_contains_sequence_substring(inputstr,sequences):
 modelbusy = threading.Lock()
 requestsinqueue = 0
 defaultport = 5001
-KcppVersion = "1.64"
+KcppVersion = "1.64.1"
 showdebug = True
 showsamplerwarning = True
 showmaxctxwarning = True