diff --git a/.github/workflows/kcpp-build-release-linux-cuda12.yaml b/.github/workflows/kcpp-build-release-linux-cuda12.yaml
new file mode 100644
index 0000000000000..0cbeaf9a2b942
--- /dev/null
+++ b/.github/workflows/kcpp-build-release-linux-cuda12.yaml
@@ -0,0 +1,33 @@
+name: Koboldcpp Builder Linux CUDA12
+
+on: workflow_dispatch
+env:
+ BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+ KCPP_CUDA: 12.1.0
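+ # consumed by koboldcpp.sh, which swaps the nvidia/label/cuda-11.5.0 conda channel in environment.yaml for this version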
+
+jobs:
+ linux:
+ runs-on: ubuntu-20.04
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+ with:
+ ref: concedo_experimental
+
+ - name: Dependencies
+ id: depends
+ run: |
+ sudo apt-get update
+ sudo apt-get install git curl bzip2
+
+ - name: Build
+ id: make_build
+ run: |
+ ./koboldcpp.sh dist
+
+ - name: Save artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: kcpp_linux_binary
+ path: dist/
diff --git a/.github/workflows/kcpp-build-release-win-cuda12.yaml b/.github/workflows/kcpp-build-release-win-cuda12.yaml
new file mode 100644
index 0000000000000..ed9065d939728
--- /dev/null
+++ b/.github/workflows/kcpp-build-release-win-cuda12.yaml
@@ -0,0 +1,34 @@
+name: Koboldcpp Builder Windows CUDA12
+
+on: workflow_dispatch
+env:
+ BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+jobs:
+ windows:
+ runs-on: windows-2019
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v3
+ with:
+ ref: concedo_experimental
+
+ - uses: Jimver/cuda-toolkit@v0.2.11
+ id: cuda-toolkit
+ with:
+ cuda: '12.1.0'
+
+ - name: Build
+ id: cmake_build
+ run: |
+ mkdir build
+ cd build
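+ # LLAMA_CUBLAS=ON builds the CUDA (cuBLAS) backend; CMAKE_SYSTEM_VERSION pins the Windows SDK targeted by the build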
+ cmake .. -DLLAMA_CUBLAS=ON -DCMAKE_SYSTEM_VERSION="10.0.19041.0"
+ cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+
+ - name: Save artifact
+ uses: actions/upload-artifact@v3
+ with:
+ name: kcpp_windows_cuda_binary
+ path: build/bin/Release/
diff --git a/README.md b/README.md
index 899c56adabba3..852947823b1c9 100644
--- a/README.md
+++ b/README.md
@@ -41,8 +41,16 @@ My typical start command looks like this: ``python koboldcpp.py --threads 6 --bl
- By default, you can connect to http://localhost:5001
- You can also run it using the command line. For info, please check `koboldcpp.exe --help` or `python koboldcpp.py --help`
+- **(Nvidia Only) GPU Acceleration**: If you're on Windows with an Nvidia GPU, you can get CUDA support out of the box using the `--usecublas` flag. Make sure you select the correct .exe with CUDA support.
+- **Any GPU Acceleration**: As a slightly slower alternative, try CLBlast with the `--useclblast` flag for a more broadly compatible GPU speedup.
+- **GPU Layer Offloading**: Want even more speedup? Combine one of the above GPU flags with `--gpulayers` to offload entire layers to the GPU! **Much faster, but uses more VRAM**. Experiment to determine the number of layers to offload, and reduce by a few if you run out of memory (see the example command below).
+- **Increasing Context Size**: Try `--contextsize 4096` to double your context size without much increase in perplexity. Note that you'll have to increase the max context in the Kobold Lite UI as well (click and edit the number text field).
+- If you are having crashes or issues, you can try turning off BLAS with the `--noblas` flag. You can also try running in non-AVX2 compatibility mode with `--noavx2`. Lastly, you can try turning off mmap with `--nommap`.
+
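+For example, a combined launch (the model filename here is only a placeholder) might look like ``python koboldcpp.py --model yourmodel.gguf --usecublas --gpulayers 40 --contextsize 4096``, adjusting ``--gpulayers`` to fit your available VRAM.
+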
+For more information, be sure to run the program with the `--help` flag, or [check the wiki](https://github.com/LostRuins/koboldcpp/wiki).
+
+
## Compiling for AMD on Windows
-- You're encouraged to use the .exe released, but if you want to compile your binaries from source at Windows, the easiest way is:
- Use the latest release of w64devkit (https://github.com/skeeto/w64devkit). Be sure to use the "vanilla" one, not i686 or other variants. If you try them, they will conflict with the precompiled libs!
- Make sure you are using the w64devkit integrated terminal (PowerShell should work for the cmake hipblas part)
- *This site may be useful; it has some patches for Windows ROCm that I used to help with compilation, but I'm not sure if they're necessary.* https://streamhpc.com/blog/2023-08-01/how-to-get-full-cmake-support-for-amd-hip-sdk-on-windows-including-patches/
@@ -111,8 +119,8 @@ You can then run koboldcpp anywhere from the terminal by running `koboldcpp` to
- Clone the repo `git clone https://github.com/LostRuins/koboldcpp.git`
- Navigate to the koboldcpp folder `cd koboldcpp`
- Build the project `make`
-- Grab a small GGUF model, such as `wget https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf`
-- Start the python server `python koboldcpp.py --model phi-2.Q2_K.gguf`
+- Grab a small GGUF model, such as `wget https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf`
+- Start the python server `python koboldcpp.py --model KobbleTiny-Q4_K.gguf`
- Connect to `http://localhost:5001` on your mobile browser
- If you encounter any errors, make sure your packages are up-to-date with `pkg up`
- GPU acceleration for Termux may be possible but I have not explored it. If you find a good cross-device solution, do share or PR it.
diff --git a/colab.ipynb b/colab.ipynb
index ca791c6859e34..606069712b94b 100644
--- a/colab.ipynb
+++ b/colab.ipynb
@@ -48,7 +48,7 @@
"source": [
"#@title v-- Enter your model below and then click this to start Koboldcpp\r\n",
"\r\n",
- "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/Sao10K/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2-Test-14.q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/afrideva/phi-2-uncensored-GGUF/resolve/main/phi-2-uncensored.q3_k_m.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\"]{allow-input: true}\r\n",
+ "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/Sao10K/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2-Test-14.q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\"]{allow-input: true}\r\n",
"Layers = 99 #@param [99]{allow-input: true}\r\n",
"ContextSize = 4096 #@param [4096] {allow-input: true}\r\n",
"ForceRebuild = False #@param {type:\"boolean\"}\r\n",
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e3c9bcd4364aa..f568f470c8f5c 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -104,7 +104,6 @@ static std::string format(const char * fmt, ...) {
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight"
-#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@@ -426,7 +425,6 @@ struct clip_vision_model {
// embeddings
struct ggml_tensor * class_embedding;
struct ggml_tensor * patch_embeddings;
- struct ggml_tensor * patch_bias;
struct ggml_tensor * position_embeddings;
struct ggml_tensor * pre_ln_w;
@@ -503,11 +501,6 @@ struct clip_ctx {
bool use_gelu = false;
int32_t ftype = 1;
- bool has_class_embedding = true;
- bool has_pre_norm = true;
- bool has_post_norm = false;
- bool has_patch_bias = false;
-
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data;
@@ -533,7 +526,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
- const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+ const int num_positions = num_patches + 1;
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
@@ -564,23 +557,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
- if (ctx->has_patch_bias) {
- // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
- inp = ggml_add(ctx0, inp, model.patch_bias);
- }
-
// concat class_embeddings and patch_embeddings
- struct ggml_tensor * embeddings = inp;
- if (ctx->has_class_embedding) {
- embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
- embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
- embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
- embeddings = ggml_acc(ctx0, embeddings, inp,
- embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
- }
+ struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
+ embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+ embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+
+ embeddings = ggml_acc(ctx0, embeddings, inp,
+ embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions");
@@ -590,7 +576,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
// pre-layernorm
- if (ctx->has_pre_norm) {
+ {
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln");
@@ -678,14 +664,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = cur;
}
- // post-layernorm
- if (ctx->has_post_norm) {
- embeddings = ggml_norm(ctx0, embeddings, eps);
- ggml_set_name(embeddings, "post_ln");
-
- embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
- }
-
// llava projector
{
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1170,39 +1148,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
- try {
- vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
- new_clip->has_class_embedding = true;
- } catch (const std::exception& e) {
- new_clip->has_class_embedding = false;
- }
-
- try {
- vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
- vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
- new_clip->has_pre_norm = true;
- } catch (std::exception & e) {
- new_clip->has_pre_norm = false;
- }
-
- try {
- vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
- vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
- new_clip->has_post_norm = true;
- } catch (std::exception & e) {
- new_clip->has_post_norm = false;
- }
-
- try {
- vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
- new_clip->has_patch_bias = true;
- } catch (std::exception & e) {
- new_clip->has_patch_bias = false;
- }
-
try {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+ vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+ vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+ vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
} catch(const std::exception& e) {
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
}
diff --git a/koboldcpp.py b/koboldcpp.py
index c6c8b48daf4ff..2a349548fbc43 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -661,7 +661,7 @@ def string_contains_sequence_substring(inputstr,sequences):
modelbusy = threading.Lock()
requestsinqueue = 0
defaultport = 5001
-KcppVersion = "1.64.yr0-ROCm"
+KcppVersion = "1.64.1.yr0-ROCm"
showdebug = True
showsamplerwarning = True
showmaxctxwarning = True
@@ -1528,16 +1528,14 @@ def show_new_gui():
from tkinter.filedialog import askopenfilename
from tkinter.filedialog import asksaveasfile
- global using_gui_launcher
- using_gui_launcher = True
-
# if args received, launch
if len(sys.argv) != 1:
import tkinter as tk
root = tk.Tk() #we dont want the useless window to be visible, but we want it in taskbar
root.attributes("-alpha", 0)
args.model_param = askopenfilename(title="Select ggml model .bin or .gguf file or .kcpps config")
- root.destroy()
+ root.withdraw()
+ root.quit()
if args.model_param and args.model_param!="" and args.model_param.lower().endswith('.kcpps'):
loadconfigfile(args.model_param)
if not args.model_param and not args.sdconfig:
@@ -1594,6 +1592,8 @@ def on_resize(event):
ctk.set_widget_scaling(smallratio)
root.bind("", on_resize)
+ global using_gui_launcher
+ using_gui_launcher = True
# trigger empty tooltip then remove it
def show_tooltip(event, tooltip_text=None):
@@ -2155,7 +2155,6 @@ def changerunmode(a,b,c):
if index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
- quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
splitmode_box.grid(row=5, column=1, padx=8, pady=1, stick="nw")
@@ -2163,7 +2162,6 @@ def changerunmode(a,b,c):
tensor_split_entry.grid(row=8, column=1, padx=8, pady=1, stick="nw")
else:
lowvram_box.grid_forget()
- quick_lowvram_box.grid_forget()
mmq_box.grid_forget()
quick_mmq_box.grid_forget()
tensor_split_label.grid_forget()
@@ -2201,7 +2199,6 @@ def changerunmode(a,b,c):
quick_gpuname_label.grid(row=3, column=1, padx=75, sticky="W")
quick_gpuname_label.configure(text_color="#ffff00")
quick_gpu_layers_entry,quick_gpu_layers_label = makelabelentry(quick_tab,"GPU Layers:", gpulayers_var, 6, 50,"How many layers to offload onto the GPU.\nVRAM intensive, usage increases with model and context size.\nRequires some trial and error to find the best fit value.")
- quick_lowvram_box = makecheckbox(quick_tab, "Low VRAM (No KV offload)", lowvram_var, 4,0,tooltiptxt="Avoid offloading KV Cache or scratch buffers to VRAM.\nAllows more layers to fit, but may result in a speed loss.")
quick_mmq_box = makecheckbox(quick_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1,tooltiptxt="Enable MMQ mode instead of CuBLAS for prompt processing. Read the wiki. Speed may vary.")
@@ -2364,7 +2361,8 @@ def guilaunch():
model_var.set(tmp)
nonlocal nextstate
nextstate = 1
- root.destroy()
+ root.withdraw()
+ root.quit()
pass
def export_vars():
@@ -2816,7 +2814,8 @@ def show_gui_msgbox(title,message):
root = tk.Tk()
root.attributes("-alpha", 0)
messagebox.showerror(title=title, message=message)
- root.destroy()
+ root.withdraw()
+ root.quit()
except Exception as ex2:
pass
@@ -3548,8 +3547,8 @@ def onready_subprocess():
else:
print(f"\nRunning benchmark (Not Saved)...")
- benchprompt = "11111111"
- for i in range(0,10): #generate massive prompt
+ benchprompt = "1111111111111111"
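+ # 16 characters doubled 12 times yields a 65536-character benchmark prompt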
+ for i in range(0,12): #generate massive prompt
benchprompt += benchprompt
genout = generate(benchprompt,memory="",images=[],max_length=benchlen,max_context_length=benchmaxctx,temperature=0.1,top_k=1,rep_pen=1,use_default_badwordsids=True)
result = genout['text']
@@ -3560,7 +3559,7 @@ def onready_subprocess():
s_pp = float(benchmaxctx-benchlen)/t_pp
s_gen = float(benchlen)/t_gen
datetimestamp = datetime.now(timezone.utc)
- print(f"\nBenchmark Completed - Results:\n======")
+ print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
print(f"Timestamp: {datetimestamp}")
print(f"Backend: {libname}")
print(f"Layers: {args.gpulayers}")
@@ -3583,7 +3582,11 @@ def onready_subprocess():
file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{resultok},{result}")
except Exception as e:
print(f"Error writing benchmark to file: {e}")
-
+ global using_gui_launcher
+ if using_gui_launcher and not save_to_file:
+ print("===")
+ print("Press ENTER key to exit.", flush=True)
+ input()
if start_server:
if args.checkforupdates:
@@ -3597,11 +3600,6 @@ def onready_subprocess():
else:
# Flush stdout for previous win32 issue so the client can see output.
print(f"Server was not started, main function complete. Idling.", flush=True)
- global using_gui_launcher
- if using_gui_launcher:
- print("===")
- print("Press a key to exit", flush=True)
- input()
def run_in_queue(launch_args, input_queue, output_queue):
main(launch_args, start_server=False)
diff --git a/koboldcpp.sh b/koboldcpp.sh
index b2923ec4ff7bd..587270d2add22 100755
--- a/koboldcpp.sh
+++ b/koboldcpp.sh
@@ -1,13 +1,24 @@
#!/bin/bash
+
if [ ! -f "bin/micromamba" ]; then
curl -Ls https://anaconda.org/conda-forge/micromamba/1.5.3/download/linux-64/micromamba-1.5.3-0.tar.bz2 | tar -xvj bin/micromamba
fi
if [[ ! -f "conda/envs/linux/bin/python" || $1 == "rebuild" ]]; then
- bin/micromamba create --no-shortcuts -r conda -n linux -f environment.yaml -y
- bin/micromamba create --no-shortcuts -r conda -n linux -f environment.yaml -y
+ cp environment.yaml environment.tmp.yaml
+ if [ -n "$KCPP_CUDA" ]; then
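+ # e.g. KCPP_CUDA=12.1.0 rewrites the nvidia/label/cuda-11.5.0 channel to nvidia/label/cuda-12.1.0 in the temp copy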
+ sed -i -e "s/nvidia\/label\/cuda-11.5.0/nvidia\/label\/cuda-$KCPP_CUDA/g" environment.tmp.yaml
+ else
+ KCPP_CUDA=11.5.0
+ fi
+ bin/micromamba create --no-shortcuts -r conda -n linux -f environment.tmp.yaml -y
+ bin/micromamba create --no-shortcuts -r conda -n linux -f environment.tmp.yaml -y
bin/micromamba run -r conda -n linux make clean
+ echo $KCPP_CUDA > conda/envs/linux/cudaver
+ rm environment.tmp.yaml
fi
+KCPP_CUDA=$(