diff --git a/.github/workflows/kcpp-build-release-linux-cuda12.yaml b/.github/workflows/kcpp-build-release-linux-cuda12.yaml
new file mode 100644
index 0000000000000..0cbeaf9a2b942
--- /dev/null
+++ b/.github/workflows/kcpp-build-release-linux-cuda12.yaml
@@ -0,0 +1,33 @@
+name: Koboldcpp Builder Linux CUDA12
+
+on: workflow_dispatch
+env:
+ BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+ KCPP_CUDA: 12.1.0
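+ # consumed by koboldcpp.sh, which swaps the nvidia/label/cuda-11.5.0 conda channel in environment.yaml for this version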
+
+jobs:
+ linux:
+ runs-on: ubuntu-20.04
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+ with:
+ ref: concedo_experimental
+
+ - name: Dependencies
+ id: depends
+ run: |
+ sudo apt-get update
+ sudo apt-get install git curl bzip2
+
+ - name: Build
+ id: make_build
+ run: |
+ ./koboldcpp.sh dist
+
+ - name: Save artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: kcpp_linux_binary
+ path: dist/
diff --git a/.github/workflows/kcpp-build-release-win-cuda12.yaml b/.github/workflows/kcpp-build-release-win-cuda12.yaml
new file mode 100644
index 0000000000000..ed9065d939728
--- /dev/null
+++ b/.github/workflows/kcpp-build-release-win-cuda12.yaml
@@ -0,0 +1,34 @@
+name: Koboldcpp Builder Windows CUDA12
+
+on: workflow_dispatch
+env:
+ BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+jobs:
+ windows:
+ runs-on: windows-2019
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+ with:
+ ref: concedo_experimental
+
+ - uses: Jimver/cuda-toolkit@v0.2.11
+ id: cuda-toolkit
+ with:
+ cuda: '12.1.0'
+
+ - name: Build
+ id: cmake_build
+ run: |
+ mkdir build
+ cd build
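+ # LLAMA_CUBLAS=ON builds the CUDA (cuBLAS) backend; CMAKE_SYSTEM_VERSION pins the Windows SDK targeted by the build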
+ cmake .. -DLLAMA_CUBLAS=ON -DCMAKE_SYSTEM_VERSION="10.0.19041.0"
+ cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+
+ - name: Save artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: kcpp_windows_cuda_binary
+ path: build/bin/Release/
diff --git a/README.md b/README.md
index 899c56adabba3..852947823b1c9 100644
--- a/README.md
+++ b/README.md
@@ -41,8 +41,16 @@ My typical start command looks like this: ``python koboldcpp.py --threads 6 --bl
- By default, you can connect to http://localhost:5001
- You can also run it using the command line. For info, please check `koboldcpp.exe --help` or `python koboldcpp.py --help`
+- **(Nvidia Only) GPU Acceleration**: If you're on Windows with an Nvidia GPU, you can get CUDA support out of the box using the `--usecublas` flag. Make sure you select the correct .exe with CUDA support.
+- **Any GPU Acceleration**: As a slightly slower alternative, try CLBlast with the `--useclblast` flag for a more broadly compatible GPU speedup.
+- **GPU Layer Offloading**: Want even more speedup? Combine one of the above GPU flags with `--gpulayers` to offload entire layers to the GPU! **Much faster, but uses more VRAM**. Experiment to determine the number of layers to offload, and reduce by a few if you run out of memory (see the example command below).
+- **Increasing Context Size**: Try `--contextsize 4096` to double your context size without much increase in perplexity. Note that you'll have to increase the max context in the Kobold Lite UI as well (click and edit the number text field).
+- If you are having crashes or issues, you can try turning off BLAS with the `--noblas` flag. You can also try running in non-AVX2 compatibility mode with `--noavx2`. Lastly, you can try turning off mmap with `--nommap`.
+
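+For example, a combined launch (the model filename here is only a placeholder) might look like ``python koboldcpp.py --model yourmodel.gguf --usecublas --gpulayers 40 --contextsize 4096``, adjusting ``--gpulayers`` to fit your available VRAM.
+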
+For more information, be sure to run the program with the `--help` flag, or [check the wiki](https://github.com/LostRuins/koboldcpp/wiki).
+
+
## Compiling for AMD on Windows
-- You're encouraged to use the .exe released, but if you want to compile your binaries from source at Windows, the easiest way is:
- Use the latest release of w64devkit (https://github.com/skeeto/w64devkit). Be sure to use the "vanilla" one, not i686 or other variants. If you try them, they will conflict with the precompiled libs!
- Make sure you are using the w64devkit integrated terminal (PowerShell should work for the cmake hipblas part)
- *This site may be useful; it has some patches for Windows ROCm that I used to help with compilation, but I'm not sure if they're necessary.* https://streamhpc.com/blog/2023-08-01/how-to-get-full-cmake-support-for-amd-hip-sdk-on-windows-including-patches/
@@ -111,8 +119,8 @@ You can then run koboldcpp anywhere from the terminal by running `koboldcpp` to
- Clone the repo `git clone https://github.com/LostRuins/koboldcpp.git`
- Navigate to the koboldcpp folder `cd koboldcpp`
- Build the project `make`
-- Grab a small GGUF model, such as `wget https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf`
-- Start the python server `python koboldcpp.py --model phi-2.Q2_K.gguf`
+- Grab a small GGUF model, such as `wget https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf`
+- Start the python server `python koboldcpp.py --model KobbleTiny-Q4_K.gguf`
- Connect to `http://localhost:5001` on your mobile browser
- If you encounter any errors, make sure your packages are up-to-date with `pkg up`
- GPU acceleration for Termux may be possible but I have not explored it. If you find a good cross-device solution, do share or PR it.
diff --git a/colab.ipynb b/colab.ipynb
index ca791c6859e34..606069712b94b 100644
--- a/colab.ipynb
+++ b/colab.ipynb
@@ -48,7 +48,7 @@
"source": [
"#@title v-- Enter your model below and then click this to start Koboldcpp\r\n",
"\r\n",
- "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/Sao10K/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2-Test-14.q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/afrideva/phi-2-uncensored-GGUF/resolve/main/phi-2-uncensored.q3_k_m.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\"]{allow-input: true}\r\n",
+ "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/Sao10K/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2-Test-14.q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\"]{allow-input: true}\r\n",
"Layers = 99 #@param [99]{allow-input: true}\r\n",
"ContextSize = 4096 #@param [4096] {allow-input: true}\r\n",
"ForceRebuild = False #@param {type:\"boolean\"}\r\n",
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e3c9bcd4364aa..f568f470c8f5c 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -104,7 +104,6 @@ static std::string format(const char * fmt, ...) {
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight"
-#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@@ -426,7 +425,6 @@ struct clip_vision_model {
// embeddings
struct ggml_tensor * class_embedding;
struct ggml_tensor * patch_embeddings;
- struct ggml_tensor * patch_bias;
struct ggml_tensor * position_embeddings;
struct ggml_tensor * pre_ln_w;
@@ -503,11 +501,6 @@ struct clip_ctx {
bool use_gelu = false;
int32_t ftype = 1;
- bool has_class_embedding = true;
- bool has_pre_norm = true;
- bool has_post_norm = false;
- bool has_patch_bias = false;
-
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data;
@@ -533,7 +526,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
- const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+ const int num_positions = num_patches + 1;
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
@@ -564,23 +557,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
- if (ctx->has_patch_bias) {
- // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
- inp = ggml_add(ctx0, inp, model.patch_bias);
- }
-
// concat class_embeddings and patch_embeddings
- struct ggml_tensor * embeddings = inp;
- if (ctx->has_class_embedding) {
- embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
- embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
- embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
- embeddings = ggml_acc(ctx0, embeddings, inp,
- embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
- }
+ struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
+ embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+ embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+
+ embeddings = ggml_acc(ctx0, embeddings, inp,
+ embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions");
@@ -590,7 +576,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
// pre-layernorm
- if (ctx->has_pre_norm) {
+ {
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln");
@@ -678,14 +664,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = cur;
}
- // post-layernorm
- if (ctx->has_post_norm) {
- embeddings = ggml_norm(ctx0, embeddings, eps);
- ggml_set_name(embeddings, "post_ln");
-
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
- }
-
// llava projector
{
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1170,39 +1148,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
- try {
- vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
- new_clip->has_class_embedding = true;
- } catch (const std::exception& e) {
- new_clip->has_class_embedding = false;
- }
-
- try {
- vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
- vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
- new_clip->has_pre_norm = true;
- } catch (std::exception & e) {
- new_clip->has_pre_norm = false;
- }
-
- try {
- vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
- vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
- new_clip->has_post_norm = true;
- } catch (std::exception & e) {
- new_clip->has_post_norm = false;
- }
-
- try {
- vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
- new_clip->has_patch_bias = true;
- } catch (std::exception & e) {
- new_clip->has_patch_bias = false;
- }
-
try {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+ vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+ vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+ vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
} catch(const std::exception& e) {
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
}
diff --git a/koboldcpp.py b/koboldcpp.py
index c6c8b48daf4ff..2a349548fbc43 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -661,7 +661,7 @@ def string_contains_sequence_substring(inputstr,sequences):
modelbusy = threading.Lock()
requestsinqueue = 0
defaultport = 5001
-KcppVersion = "1.64.yr0-ROCm"
+KcppVersion = "1.64.1.yr0-ROCm"
showdebug = True
showsamplerwarning = True
showmaxctxwarning = True
@@ -1528,16 +1528,14 @@ def show_new_gui():
from tkinter.filedialog import askopenfilename
from tkinter.filedialog import asksaveasfile
- global using_gui_launcher
- using_gui_launcher = True
-
# if args received, launch
if len(sys.argv) != 1:
import tkinter as tk
root = tk.Tk() #we dont want the useless window to be visible, but we want it in taskbar
root.attributes("-alpha", 0)
args.model_param = askopenfilename(title="Select ggml model .bin or .gguf file or .kcpps config")
- root.destroy()
+ root.withdraw()
+ root.quit()
if args.model_param and args.model_param!="" and args.model_param.lower().endswith('.kcpps'):
loadconfigfile(args.model_param)
if not args.model_param and not args.sdconfig:
@@ -1594,6 +1592,8 @@ def on_resize(event):
ctk.set_widget_scaling(smallratio)
root.bind("", on_resize)
+ global using_gui_launcher
+ using_gui_launcher = True
# trigger empty tooltip then remove it
def show_tooltip(event, tooltip_text=None):
@@ -2155,7 +2155,6 @@ def changerunmode(a,b,c):
if index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
- quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
splitmode_box.grid(row=5, column=1, padx=8, pady=1, stick="nw")
@@ -2163,7 +2162,6 @@ def changerunmode(a,b,c):
tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
else:
lowvram_box.grid_forget()
- quick_lowvram_box.grid_forget()
mmq_box.grid_forget()
quick_mmq_box.grid_forget()
tensor_split_label.grid_forget()
@@ -2201,7 +2199,6 @@ def changerunmode(a,b,c):
quick_gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
quick_gpuname_label.configure(text_color="#ffff00")
quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 6, 50,"How many layers to offload onto the GPU.\nVRAM intensive, usage increases with model and context size.\nRequires some trial and error to find the best fit value.")
- quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM (No KV offload)", lowvram_var, 4,0,tooltiptxt="Avoid offloading KV Cache or scratch buffers to VRAM.\nAllows more layers to fit, but may result in a speed loss.")
quick_mmq_box = makecheckbox(quick_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1,tooltiptxt="Enable MMQ mode instead of CuBLAS for prompt processing. Read the wiki. Speed may vary.")
@@ -2364,7 +2361,8 @@ def guilaunch():
model_var.set(tmp)
nonlocal nextstate
nextstate = 1
- root.destroy()
+ root.withdraw()
+ root.quit()
pass
def export_vars():
@@ -2816,7 +2814,8 @@ def show_gui_msgbox(title,message):
root = tk.Tk()
root.attributes("-alpha", 0)
messagebox.showerror(title=title, message=message)
- root.destroy()
+ root.withdraw()
+ root.quit()
except Exception as ex2:
pass
@@ -3548,8 +3547,8 @@ def onready_subprocess():
else:
print(f"\nRunning benchmark (Not Saved)...")
- benchprompt = "11111111"
- for i in range(0,10): #generate massive prompt
+ benchprompt = "1111111111111111"
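+ # 16 characters doubled 12 times yields a 65536-character benchmark prompt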
+ for i in range(0,12): #generate massive prompt
benchprompt += benchprompt
genout = generate(benchprompt,memory="",images=[],max_length=benchlen,max_context_length=benchmaxctx,temperature=0.1,top_k=1,rep_pen=1,use_default_badwordsids=True)
result = genout['text']
@@ -3560,7 +3559,7 @@ def onready_subprocess():
s_pp = float(benchmaxctx-benchlen)/t_pp
s_gen = float(benchlen)/t_gen
datetimestamp = datetime.now(timezone.utc)
- print(f"\nBenchmark Completed - Results:\n======")
+ print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
print(f"Timestamp: {datetimestamp}")
print(f"Backend: {libname}")
print(f"Layers: {args.gpulayers}")
@@ -3583,7 +3582,11 @@ def onready_subprocess():
file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{resultok},{result}")
except Exception as e:
print(f"Error writing benchmark to file: {e}")
-
+ global using_gui_launcher
+ if using_gui_launcher and not save_to_file:
+ print("===")
+ print("Press ENTER key to exit.", flush=True)
+ input()
if start_server:
if args.checkforupdates:
@@ -3597,11 +3600,6 @@ def onready_subprocess():
else:
# Flush stdout for previous win32 issue so the client can see output.
print(f"Server was not started, main function complete. Idling.", flush=True)
- global using_gui_launcher
- if using_gui_launcher:
- print("===")
- print("Press a key to exit", flush=True)
- input()
def run_in_queue(launch_args, input_queue, output_queue):
main(launch_args, start_server=False)
diff --git a/koboldcpp.sh b/koboldcpp.sh
index b2923ec4ff7bd..587270d2add22 100755
--- a/koboldcpp.sh
+++ b/koboldcpp.sh
@@ -1,13 +1,24 @@
#!/bin/bash
+
if [ ! -f "bin/micromamba" ]; then
curl -Ls https://anaconda.org/conda-forge/micromamba/1.5.3/download/linux-64/micromamba-1.5.3-0.tar.bz2 | tar -xvj bin/micromamba
fi
if [[ ! -f "conda/envs/linux/bin/python" || $1 == "rebuild" ]]; then
- bin/micromamba create --no-shortcuts -r conda -n linux -f environment.yaml -y
- bin/micromamba create --no-shortcuts -r conda -n linux -f environment.yaml -y
+ cp environment.yaml environment.tmp.yaml
+ if [ -n "$KCPP_CUDA" ]; then
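+ # e.g. KCPP_CUDA=12.1.0 rewrites the nvidia/label/cuda-11.5.0 channel to nvidia/label/cuda-12.1.0 in the temp copy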
+ sed -i -e "s/nvidia\/label\/cuda-11.5.0/nvidia\/label\/cuda-$KCPP_CUDA/g" environment.tmp.yaml
+ else
+ KCPP_CUDA=11.5.0
+ fi
+ bin/micromamba create --no-shortcuts -r conda -n linux -f environment.tmp.yaml -y
+ bin/micromamba create --no-shortcuts -r conda -n linux -f environment.tmp.yaml -y
bin/micromamba run -r conda -n linux make clean
+ echo $KCPP_CUDA > conda/envs/linux/cudaver
+ rm environment.tmp.yaml
fi
+KCPP_CUDA=$(