
Commit

Merge remote-tracking branch 'upstream/concedo'
YellowRoseCx committed Jun 21, 2024
2 parents 257d27b + c9c050f commit 50f1ce8
Showing 367 changed files with 45,810 additions and 40,184 deletions.
26 changes: 26 additions & 0 deletions .devops/llama-cli-intel.Dockerfile
@@ -0,0 +1,26 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git

WORKDIR /app

COPY . .

RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-cli

FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

COPY --from=build /app/build/bin/llama-cli /llama-cli

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]
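For reference, a minimal sketch of building and running this new image; the tag `llama-cli-intel`, the model path, and the `/dev/dri` device mapping are illustrative assumptions, not part of this commit:

```sh
# Build from the repository root, enabling FP16 SYCL kernels via the build arg above.
docker build -f .devops/llama-cli-intel.Dockerfile \
  --build-arg LLAMA_SYCL_F16=ON \
  -t llama-cli-intel .

# Run with an Intel GPU exposed to the container (the device path varies by system).
docker run --rm -it --device=/dev/dri \
  -v "$PWD/models:/models" \
  llama-cli-intel -m /models/model.gguf -p "Hello"
```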
29 changes: 29 additions & 0 deletions .devops/llama-server-intel.Dockerfile
@@ -0,0 +1,29 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-server

FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev

COPY --from=build /app/build/bin/llama-server /llama-server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-server" ]
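Similarly, a hedged sketch for the server image; the tag, port, and model path below are placeholders:

```sh
docker build -f .devops/llama-server-intel.Dockerfile -t llama-server-intel .

# Expose the HTTP API on port 8080; -m, --host and --port are standard llama-server flags.
docker run --rm -it --device=/dev/dri -p 8080:8080 \
  -v "$PWD/models:/models" \
  llama-server-intel -m /models/model.gguf --host 0.0.0.0 --port 8080
```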
3 changes: 3 additions & 0 deletions .editorconfig
@@ -26,3 +26,6 @@ indent_size = 2

[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab

[examples/cvector-generator/*.txt]
insert_final_newline = unset
7 changes: 7 additions & 0 deletions .github/pull_request_template.md
@@ -0,0 +1,7 @@


- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
- Self-reported review complexity:
- [ ] Low
- [ ] Medium
- [ ] High
2 changes: 1 addition & 1 deletion .github/workflows/kcpp-build-release-linux-cuda12.yaml
@@ -1,4 +1,4 @@
name: Koboldcpp Builder Linux CUDA12
name: Koboldcpp Linux CUDA12

on: workflow_dispatch
env:
2 changes: 1 addition & 1 deletion .github/workflows/kcpp-build-release-linux.yaml
@@ -1,4 +1,4 @@
name: Koboldcpp Builder Linux
name: Koboldcpp Linux

on: workflow_dispatch
env:
2 changes: 1 addition & 1 deletion .github/workflows/kcpp-build-release-win-cuda.yaml
@@ -1,4 +1,4 @@
name: Koboldcpp Builder Windows CUDA
name: Koboldcpp Windows CUDA

on: workflow_dispatch
env:
2 changes: 1 addition & 1 deletion .github/workflows/kcpp-build-release-win-cuda12.yaml
@@ -1,4 +1,4 @@
name: Koboldcpp Builder Windows CUDA12
name: Koboldcpp Windows CUDA12

on: workflow_dispatch
env:
2 changes: 1 addition & 1 deletion .github/workflows/kcpp-build-release-win-full-cu12.yaml
@@ -1,4 +1,4 @@
name: Koboldcpp Builder Windows Full Binaries CUDA 12
name: Koboldcpp Windows Full Binaries CUDA 12

on: workflow_dispatch
env:
2 changes: 1 addition & 1 deletion .github/workflows/kcpp-build-release-win-full.yaml
@@ -1,4 +1,4 @@
name: Koboldcpp Builder Windows Full Binaries
name: Koboldcpp Windows Full Binaries

on: workflow_dispatch
env:
78 changes: 78 additions & 0 deletions .github/workflows/kcpp-build-release-win-noavx2-full.yaml
@@ -0,0 +1,78 @@
name: Koboldcpp Windows Full Binaries

on: workflow_dispatch
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

jobs:
windows:
runs-on: windows-2019
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
ref: ${{ github.head_ref || github.ref_name }}

- name: Get Python
uses: actions/setup-python@v2
with:
python-version: 3.8.10

- name: Install python dependencies
run: |
python -m pip install --upgrade pip
pip install customtkinter==5.2.0 pyinstaller==5.11.0 psutil==5.9.5
- name: Download and install win64devkit
run: |
curl -L https://github.com/skeeto/w64devkit/releases/download/v1.22.0/w64devkit-1.22.0.zip --output w64devkit.zip
Expand-Archive w64devkit.zip -DestinationPath .
- name: Add w64devkit to PATH
run: |
echo "$(Get-Location)\w64devkit\bin" | Out-File -Append -FilePath $env:GITHUB_PATH -Encoding utf8
- name: Build Non-CUDA
id: make_build
run: |
make -j ${env:NUMBER_OF_PROCESSORS}
- uses: Jimver/[email protected]
id: cuda-toolkit
with:
cuda: '11.4.4'

- name: Build CUDA
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_CUBLAS=ON -DLLAMA_AVX2=OFF -DCMAKE_SYSTEM_VERSION="10.0.19041.0"
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
mv bin/Release/koboldcpp_cublas.dll ../koboldcpp_cublas.dll
cd ..
# note: the libraries that come from the GitHub CUDA directory seem to be larger, so they are not recommended
- name: Download CuBLAS Libraries
run: |
curl -L https://github.com/LostRuins/koboldcpp/releases/download/cuda11_cublas_libraries/cublas64_11.dll --output cublas64_11.dll
curl -L https://github.com/LostRuins/koboldcpp/releases/download/cuda11_cublas_libraries/cublasLt64_11.dll --output cublasLt64_11.dll
ls
# - name: Copy CuBLAS Libraries
# run: |
# copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cublasLt64_11.dll" .
# copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cublas64_11.dll" .
# ls

- name: Package PyInstallers
id: make_pyinstaller
run: |
./make_pyinstaller.bat
./make_pyinstaller_cuda.bat
- name: Save artifact
uses: actions/upload-artifact@v3
with:
name: kcpp_windows_pyinstallers
path: dist/
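Because the new workflow is `workflow_dispatch`-only, it never runs automatically; a sketch of triggering it manually with the GitHub CLI (run from a checkout of the fork, and the branch name is a placeholder):

```sh
# Queue the manual Windows no-AVX2 build on the chosen branch.
gh workflow run kcpp-build-release-win-noavx2-full.yaml --ref concedo
```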
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -77,6 +77,8 @@ file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
set(GGML_V3_CUDA_SOURCES otherarch/ggml_v3-cuda.cu otherarch/ggml_v3-cuda.h)
set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
@@ -185,6 +187,8 @@ if (LLAMA_HIPBLAS)
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})

# only build minimal quants required for fattn quant kv

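The new `mmq*.cu` template instances are picked up automatically by the globs above at configure time; a configure-and-build sketch for the CUDA path, using the `LLAMA_CUBLAS` flag already used elsewhere in this tree (the job count is an assumption):

```sh
# Configure with CUDA enabled, then build; the mmq instances compile as part of the CUDA sources.
cmake -B build -DLLAMA_CUBLAS=ON
cmake --build build --config Release -j 8
```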
14 changes: 14 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,14 @@
# Contributing Guidelines

## Checklist

* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
* Execute [the full CI locally on your machine](ci/README.md) before publishing

## PR formatting

* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your convenience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
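As a rough illustration of that checklist (the test binary path follows the wording above and may live under `build/bin` depending on how the tests were built; the local-CI invocation assumes the layout described in `ci/README.md` and its output directories are placeholders):

```sh
# Exercise the backend implementations mentioned above.
./tests/test-backend-ops

# Run the full CI locally before publishing.
mkdir -p tmp/results tmp/mnt
bash ./ci/run.sh ./tmp/results ./tmp/mnt
```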
2 changes: 1 addition & 1 deletion MIT_LICENSE_GGML_LLAMACPP_ONLY
@@ -23,4 +23,4 @@ SOFTWARE.
===================================

Note that the above license applies ONLY to the GGML library and llama.cpp by ggerganov which are licensed under the MIT License
Kobold Lite by Concedo and the provided python ctypes bindings in koboldcpp dlls are licensed under the AGPL v3.0 License
KoboldAI Lite by Concedo and the provided python ctypes bindings in koboldcpp dlls are licensed under the AGPL v3.0 License
18 changes: 15 additions & 3 deletions Makefile
@@ -1,3 +1,6 @@
# Add custom options to Makefile.local rather than editing this file.
-include $(abspath $(lastword ${MAKEFILE_LIST})).local

default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2
tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split llama-bench perplexity
dev: koboldcpp_openblas
@@ -52,7 +55,7 @@ SIMPLECFLAGS =
FULLCFLAGS =
NONECFLAGS =

OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -DGGML_USE_BLAS -I/usr/local/include/openblas
CLBLAST_FLAGS = -DGGML_USE_CLBLAST
FAILSAFE_FLAGS = -DUSE_FAILSAFE
VULKAN_FLAGS = -DGGML_USE_VULKAN
@@ -141,13 +144,16 @@ ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with a negligible performance boost (as of the predict time).
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
CFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
CXXFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
LDFLAGS += -framework Accelerate
OBJS += ggml-blas.o
endif
endif

# it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
@@ -457,6 +463,10 @@ llavaclip_default.o: examples/llava/clip.cpp examples/llava/clip.h
llavaclip_cublas.o: examples/llava/clip.cpp examples/llava/clip.h
$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@

#this is only used for openblas and accelerate
ggml-blas.o: ggml-blas.cpp ggml-blas.h
$(CXX) $(CXXFLAGS) -c $< -o $@

#version 3 libs
ggml_v3.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
@@ -541,6 +551,8 @@ gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
gpttype_adapter.o: $(GPTTYPE_ADAPTER)
$(CXX) $(CXXFLAGS) -c $< -o $@
gpttype_adapter_openblas.o: $(GPTTYPE_ADAPTER)
$(CXX) $(CXXFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
gpttype_adapter_clblast.o: $(GPTTYPE_ADAPTER)
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
@@ -578,7 +590,7 @@ koboldcpp_default: ggml.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter
$(DEFAULT_BUILD)

ifdef OPENBLAS_BUILD
koboldcpp_openblas: ggml_v4_openblas.o ggml_v3_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
koboldcpp_openblas: ggml_v4_openblas.o ggml_v3_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o gpttype_adapter_openblas.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-blas.o $(OBJS_FULL) $(OBJS)
$(OPENBLAS_BUILD)
else
koboldcpp_openblas:
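With `ggml-blas.o` and `gpttype_adapter_openblas.o` now wired into the target, an OpenBLAS build might look like the sketch below; it assumes OpenBLAS headers are installed where `OPENBLAS_FLAGS` expects them (`/usr/local/include/openblas`):

```sh
# Build only the OpenBLAS variant (same as the `dev` target above), or everything.
make koboldcpp_openblas -j"$(nproc)"
make -j"$(nproc)"
```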
8 changes: 3 additions & 5 deletions README.md
@@ -11,9 +11,7 @@ make LLAMA_HIPBLAS=1 -j4 && \
```
When the KoboldCPP GUI appears, make sure to select "Use hipBLAS (ROCm)" and set GPU layers

--------
### Quick Summary
KoboldCpp is an easy-to-use AI text-generation software for GGML and GGUF models. It's a single self contained distributable from Concedo, that builds off llama.cpp, and adds a versatile Kobold API endpoint, additional format support, Stable Diffusion image generation, backward compatibility, as well as a fancy UI with persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer.
KoboldCpp-ROCm is an easy-to-use AI text-generation software for GGML and GGUF models. It's an AI inference software from Concedo, maintained for AMD GPUs using ROCm by YellowRose, that builds off llama.cpp, and adds a versatile Kobold API endpoint, additional format support, Stable Diffusion image generation, backward compatibility, as well as a fancy UI with persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything KoboldAI and KoboldAI Lite have to offer.

![Preview](media/preview.png)
![Preview](media/preview2.png)
@@ -43,7 +41,7 @@ My typical start command looks like this: ``python koboldcpp.py --threads 6 --bl

- **AMD GPU Acceleration**: If you're on Windows with an AMD GPU you can get CUDA/ROCm HIPblas support out of the box using the `--usecublas` flag.
- **GPU Layer Offloading**: Want even more speedup? Combine one of the above GPU flags with `--gpulayers` to offload entire layers to the GPU! **Much faster, but uses more VRAM**. Experiment to determine number of layers to offload, and reduce by a few if you run out of memory.
- **Increasing Context Size**: Try `--contextsize 4096` to 2x your context size! without much perplexity gain. Note that you'll have to increase the max context in the Kobold Lite UI as well (click and edit the number text field).
- **Increasing Context Size**: Try `--contextsize 4096` to 2x your context size! without much perplexity gain. Note that you'll have to increase the max context in the KoboldAI Lite UI as well (click and edit the number text field).
- If you are having crashes or issues, you can try turning off BLAS with the `--noblas` flag. You can also try running in a non-avx2 compatibility mode with `--noavx2`. Lastly, you can try turning off mmap with `--nommap`.

For more information, be sure to run the program with the `--help` flag, or [check the wiki](https://github.com/LostRuins/koboldcpp/wiki).
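A hedged example of combining the flags mentioned above; the model path, layer count, and context size are placeholders rather than recommendations:

```sh
# GPU offload plus a doubled context; drop --usecublas on CPU-only systems.
python koboldcpp.py --model models/model.gguf --usecublas --gpulayers 35 --contextsize 4096
```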
@@ -227,7 +225,7 @@ Comparison with OpenCL using 6800xt (old measurement)

## License
- The original GGML library and llama.cpp by ggerganov are licensed under the MIT License
- However, Kobold Lite is licensed under the AGPL v3.0 License
- However, KoboldAI Lite is licensed under the AGPL v3.0 License
- The other files are also under the AGPL v3.0 License unless otherwise stated

## Notes
2 changes: 1 addition & 1 deletion cmake/arm64-windows-llvm.cmake
@@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )

set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )