
Commit

Merge remote-tracking branch 'upstream/concedo'
YellowRoseCx committed Jun 21, 2024
2 parents 257d27b + c9c050f commit 50f1ce8
Showing 367 changed files with 45,810 additions and 40,184 deletions.
26 changes: 26 additions & 0 deletions .devops/llama-cli-intel.Dockerfile
@@ -0,0 +1,26 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git

WORKDIR /app

COPY . .

RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-cli

FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

COPY --from=build /app/build/bin/llama-cli /llama-cli

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-cli" ]
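For reference, a minimal sketch of building and running this new image; the tag `llama-cli-intel`, the model path, and the `/dev/dri` device mapping are illustrative assumptions, not part of this commit:

```sh
# Build from the repository root, enabling FP16 SYCL kernels via the build arg above.
docker build -f .devops/llama-cli-intel.Dockerfile \
  --build-arg LLAMA_SYCL_F16=ON \
  -t llama-cli-intel .

# Run with an Intel GPU exposed to the container (the device path varies by system).
docker run --rm -it --device=/dev/dri \
  -v "$PWD/models:/models" \
  llama-cli-intel -m /models/model.gguf -p "Hello"
```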
29 changes: 29 additions & 0 deletions .devops/llama-server-intel.Dockerfile
@@ -0,0 +1,29 @@
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
echo "LLAMA_SYCL_F16 is set" && \
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
fi && \
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
cmake --build build --config Release --target llama-server

FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev

COPY --from=build /app/build/bin/llama-server /llama-server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/llama-server" ]
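Similarly, a hedged sketch for the server image; the tag, port, and model path below are placeholders:

```sh
docker build -f .devops/llama-server-intel.Dockerfile -t llama-server-intel .

# Expose the HTTP API on port 8080; -m, --host and --port are standard llama-server flags.
docker run --rm -it --device=/dev/dri -p 8080:8080 \
  -v "$PWD/models:/models" \
  llama-server-intel -m /models/model.gguf --host 0.0.0.0 --port 8080
```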
3 changes: 3 additions & 0 deletions .editorconfig
@@ -26,3 +26,6 @@ indent_size = 2

[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab

[examples/cvector-generator/*.txt]
insert_final_newline = unset
7 changes: 7 additions & 0 deletions .github/pull_request_template.md
@@ -0,0 +1,7 @@


- [x] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)
- Self-reported review complexity:
- [ ] Low
- [ ] Medium
- [ ] High
2 changes: 1 addition & 1 deletion .github/workflows/kcpp-build-release-linux-cuda12.yaml
@@ -1,4 +1,4 @@
name: Koboldcpp Builder Linux CUDA12
name: Koboldcpp Linux CUDA12

on: workflow_dispatch
env:
2 changes: 1 addition & 1 deletion .github/workflows/kcpp-build-release-linux.yaml
@@ -1,4 +1,4 @@
name: Koboldcpp Builder Linux
name: Koboldcpp Linux

on: workflow_dispatch
env:
2 changes: 1 addition & 1 deletion .github/workflows/kcpp-build-release-win-cuda.yaml
@@ -1,4 +1,4 @@
name: Koboldcpp Builder Windows CUDA
name: Koboldcpp Windows CUDA

on: workflow_dispatch
env:
2 changes: 1 addition & 1 deletion .github/workflows/kcpp-build-release-win-cuda12.yaml
@@ -1,4 +1,4 @@
name: Koboldcpp Builder Windows CUDA12
name: Koboldcpp Windows CUDA12

on: workflow_dispatch
env:
2 changes: 1 addition & 1 deletion .github/workflows/kcpp-build-release-win-full-cu12.yaml
@@ -1,4 +1,4 @@
name: Koboldcpp Builder Windows Full Binaries CUDA 12
name: Koboldcpp Windows Full Binaries CUDA 12

on: workflow_dispatch
env:
2 changes: 1 addition & 1 deletion .github/workflows/kcpp-build-release-win-full.yaml
@@ -1,4 +1,4 @@
name: Koboldcpp Builder Windows Full Binaries
name: Koboldcpp Windows Full Binaries

on: workflow_dispatch
env:
78 changes: 78 additions & 0 deletions .github/workflows/kcpp-build-release-win-noavx2-full.yaml
@@ -0,0 +1,78 @@
name: Koboldcpp Windows Full Binaries

on: workflow_dispatch
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

jobs:
windows:
runs-on: windows-2019
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
ref: ${{ github.head_ref || github.ref_name }}

- name: Get Python
uses: actions/setup-python@v2
with:
python-version: 3.8.10

- name: Install python dependencies
run: |
python -m pip install --upgrade pip
pip install customtkinter==5.2.0 pyinstaller==5.11.0 psutil==5.9.5
- name: Download and install win64devkit
run: |
curl -L https://github.com/skeeto/w64devkit/releases/download/v1.22.0/w64devkit-1.22.0.zip --output w64devkit.zip
Expand-Archive w64devkit.zip -DestinationPath .
- name: Add w64devkit to PATH
run: |
echo "$(Get-Location)\w64devkit\bin" | Out-File -Append -FilePath $env:GITHUB_PATH -Encoding utf8
- name: Build Non-CUDA
id: make_build
run: |
make -j ${env:NUMBER_OF_PROCESSORS}
- uses: Jimver/[email protected]
id: cuda-toolkit
with:
cuda: '11.4.4'

- name: Build CUDA
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_CUBLAS=ON -DLLAMA_AVX2=OFF -DCMAKE_SYSTEM_VERSION="10.0.19041.0"
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
mv bin/Release/koboldcpp_cublas.dll ../koboldcpp_cublas.dll
cd ..
# note: the libraries that come from the GitHub CUDA directory seem to be larger, so they are not recommended
- name: Download CuBLAS Libraries
run: |
curl -L https://github.com/LostRuins/koboldcpp/releases/download/cuda11_cublas_libraries/cublas64_11.dll --output cublas64_11.dll
curl -L https://github.com/LostRuins/koboldcpp/releases/download/cuda11_cublas_libraries/cublasLt64_11.dll --output cublasLt64_11.dll
ls
# - name: Copy CuBLAS Libraries
# run: |
# copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cublasLt64_11.dll" .
# copy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4\bin\cublas64_11.dll" .
# ls

- name: Package PyInstallers
id: make_pyinstaller
run: |
./make_pyinstaller.bat
./make_pyinstaller_cuda.bat
- name: Save artifact
uses: actions/upload-artifact@v3
with:
name: kcpp_windows_pyinstallers
path: dist/
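Because the new workflow is `workflow_dispatch`-only, it never runs automatically; a sketch of triggering it manually with the GitHub CLI (run from a checkout of the fork, and the branch name is a placeholder):

```sh
# Queue the manual Windows no-AVX2 build on the chosen branch.
gh workflow run kcpp-build-release-win-noavx2-full.yaml --ref concedo
```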
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -77,6 +77,8 @@ file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
set(GGML_V3_CUDA_SOURCES otherarch/ggml_v3-cuda.cu otherarch/ggml_v3-cuda.h)
set(GGML_V2_CUDA_SOURCES otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h)
set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
@@ -185,6 +187,8 @@ if (LLAMA_HIPBLAS)
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})

# only build minimal quants required for fattn quant kv

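The new `mmq*.cu` template instances are picked up automatically by the globs above at configure time; a configure-and-build sketch for the CUDA path, using the `LLAMA_CUBLAS` flag already used elsewhere in this tree (the job count is an assumption):

```sh
# Configure with CUDA enabled, then build; the mmq instances compile as part of the CUDA sources.
cmake -B build -DLLAMA_CUBLAS=ON
cmake --build build --config Release -j 8
```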
14 changes: 14 additions & 0 deletions CONTRIBUTING.md
@@ -0,0 +1,14 @@
# Contributing Guidelines

## Checklist

* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
* Execute [the full CI locally on your machine](ci/README.md) before publishing

## PR formatting

* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your convenience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`
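As a rough illustration of that checklist (the test binary path follows the wording above and may live under `build/bin` depending on how the tests were built; the local-CI invocation assumes the layout described in `ci/README.md` and its output directories are placeholders):

```sh
# Exercise the backend implementations mentioned above.
./tests/test-backend-ops

# Run the full CI locally before publishing.
mkdir -p tmp/results tmp/mnt
bash ./ci/run.sh ./tmp/results ./tmp/mnt
```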
2 changes: 1 addition & 1 deletion MIT_LICENSE_GGML_LLAMACPP_ONLY
@@ -23,4 +23,4 @@ SOFTWARE.
===================================

Note that the above license applies ONLY to the GGML library and llama.cpp by ggerganov which are licensed under the MIT License
Kobold Lite by Concedo and the provided python ctypes bindings in koboldcpp dlls are licensed under the AGPL v3.0 License
KoboldAI Lite by Concedo and the provided python ctypes bindings in koboldcpp dlls are licensed under the AGPL v3.0 License
18 changes: 15 additions & 3 deletions Makefile
@@ -1,3 +1,6 @@
# Add custom options to Makefile.local rather than editing this file.
-include $(abspath $(lastword ${MAKEFILE_LIST})).local

default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2
tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split llama-bench perplexity
dev: koboldcpp_openblas
@@ -52,7 +55,7 @@ SIMPLECFLAGS =
FULLCFLAGS =
NONECFLAGS =

OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -DGGML_USE_BLAS -I/usr/local/include/openblas
CLBLAST_FLAGS = -DGGML_USE_CLBLAST
FAILSAFE_FLAGS = -DUSE_FAILSAFE
VULKAN_FLAGS = -DGGML_USE_VULKAN
@@ -141,13 +144,16 @@ ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with a negligible performance boost (as of the predict time).
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
CFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
CXXFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
LDFLAGS += -framework Accelerate
OBJS += ggml-blas.o
endif
endif

# it is recommended to use the CMAKE file to build for cublas if you can - will likely work better
OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
@@ -457,6 +463,10 @@ llavaclip_default.o: examples/llava/clip.cpp examples/llava/clip.h
llavaclip_cublas.o: examples/llava/clip.cpp examples/llava/clip.h
$(CXX) $(CXXFLAGS) $(CUBLAS_FLAGS) -c $< -o $@

#this is only used for openblas and accelerate
ggml-blas.o: ggml-blas.cpp ggml-blas.h
$(CXX) $(CXXFLAGS) -c $< -o $@

#version 3 libs
ggml_v3.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
@@ -541,6 +551,8 @@ gpttype_adapter_failsafe.o: $(GPTTYPE_ADAPTER)
$(CXX) $(CXXFLAGS) $(FAILSAFE_FLAGS) -c $< -o $@
gpttype_adapter.o: $(GPTTYPE_ADAPTER)
$(CXX) $(CXXFLAGS) -c $< -o $@
gpttype_adapter_openblas.o: $(GPTTYPE_ADAPTER)
$(CXX) $(CXXFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
gpttype_adapter_clblast.o: $(GPTTYPE_ADAPTER)
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
@@ -578,7 +590,7 @@ koboldcpp_default: ggml.o ggml_v3.o ggml_v2.o ggml_v1.o expose.o gpttype_adapter
$(DEFAULT_BUILD)

ifdef OPENBLAS_BUILD
koboldcpp_openblas: ggml_v4_openblas.o ggml_v3_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o gpttype_adapter.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FULL) $(OBJS)
koboldcpp_openblas: ggml_v4_openblas.o ggml_v3_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o gpttype_adapter_openblas.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-blas.o $(OBJS_FULL) $(OBJS)
$(OPENBLAS_BUILD)
else
koboldcpp_openblas:
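With `ggml-blas.o` and `gpttype_adapter_openblas.o` now wired into the target, an OpenBLAS build might look like the sketch below; it assumes OpenBLAS headers are installed where `OPENBLAS_FLAGS` expects them (`/usr/local/include/openblas`):

```sh
# Build only the OpenBLAS variant (same as the `dev` target above), or everything.
make koboldcpp_openblas -j"$(nproc)"
make -j"$(nproc)"
```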
8 changes: 3 additions & 5 deletions README.md
@@ -11,9 +11,7 @@ make LLAMA_HIPBLAS=1 -j4 && \
```
When the KoboldCPP GUI appears, make sure to select "Use hipBLAS (ROCm)" and set GPU layers

--------
### Quick Summary
KoboldCpp is an easy-to-use AI text-generation software for GGML and GGUF models. It's a single self contained distributable from Concedo, that builds off llama.cpp, and adds a versatile Kobold API endpoint, additional format support, Stable Diffusion image generation, backward compatibility, as well as a fancy UI with persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer.
KoboldCpp-ROCm is an easy-to-use AI text-generation software for GGML and GGUF models. It's an AI inference software from Concedo, maintained for AMD GPUs using ROCm by YellowRose, that builds off llama.cpp, and adds a versatile Kobold API endpoint, additional format support, Stable Diffusion image generation, backward compatibility, as well as a fancy UI with persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything KoboldAI and KoboldAI Lite have to offer.

![Preview](media/preview.png)
![Preview](media/preview2.png)
@@ -43,7 +41,7 @@ My typical start command looks like this: ``python koboldcpp.py --threads 6 --bl

- **AMD GPU Acceleration**: If you're on Windows with an AMD GPU you can get CUDA/ROCm HIPblas support out of the box using the `--usecublas` flag.
- **GPU Layer Offloading**: Want even more speedup? Combine one of the above GPU flags with `--gpulayers` to offload entire layers to the GPU! **Much faster, but uses more VRAM**. Experiment to determine number of layers to offload, and reduce by a few if you run out of memory.
- **Increasing Context Size**: Try `--contextsize 4096` to 2x your context size! without much perplexity gain. Note that you'll have to increase the max context in the Kobold Lite UI as well (click and edit the number text field).
- **Increasing Context Size**: Try `--contextsize 4096` to 2x your context size! without much perplexity gain. Note that you'll have to increase the max context in the KoboldAI Lite UI as well (click and edit the number text field).
- If you are having crashes or issues, you can try turning off BLAS with the `--noblas` flag. You can also try running in a non-avx2 compatibility mode with `--noavx2`. Lastly, you can try turning off mmap with `--nommap`.

For more information, be sure to run the program with the `--help` flag, or [check the wiki](https://github.com/LostRuins/koboldcpp/wiki).
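A hedged example of combining the flags mentioned above; the model path, layer count, and context size are placeholders rather than recommendations:

```sh
# GPU offload plus a doubled context; drop --usecublas on CPU-only systems.
python koboldcpp.py --model models/model.gguf --usecublas --gpulayers 35 --contextsize 4096
```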
@@ -227,7 +225,7 @@ Comparison with OpenCL using 6800xt (old measurement)

## License
- The original GGML library and llama.cpp by ggerganov are licensed under the MIT License
- However, Kobold Lite is licensed under the AGPL v3.0 License
- However, KoboldAI Lite is licensed under the AGPL v3.0 License
- The other files are also under the AGPL v3.0 License unless otherwise stated

## Notes
2 changes: 1 addition & 1 deletion cmake/arm64-windows-llvm.cmake
@@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )

set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )