Skip to content

Commit

Permalink
feat: auto select llama-cpp cuda runtime (#2306)
Browse files Browse the repository at this point in the history
* auto select cpu variant

Signed-off-by: Sertac Ozercan <[email protected]>

* remove cuda target for now

Signed-off-by: Sertac Ozercan <[email protected]>

* fix metal

Signed-off-by: Sertac Ozercan <[email protected]>

* fix path

Signed-off-by: Sertac Ozercan <[email protected]>

* cuda

Signed-off-by: Sertac Ozercan <[email protected]>

* auto select cuda

Signed-off-by: Sertac Ozercan <[email protected]>

* update test

Signed-off-by: Sertac Ozercan <[email protected]>

* select CUDA backend only if present

Signed-off-by: mudler <[email protected]>

* ci: keep cuda bin in path

Signed-off-by: mudler <[email protected]>

* Makefile: make dist now also builds cuda

Signed-off-by: mudler <[email protected]>

* Keep pushing fallback in case auto-flagset/nvidia fails

There could be other reasons why the default binary fails. For example, we might have detected an Nvidia GPU,
but the user might not have the drivers/CUDA libraries installed on the system, so the binary would fail to start.

We keep the llama.cpp fallback at the end of the list of llama.cpp backends so that loading can fall back to it in case things go wrong

Signed-off-by: mudler <[email protected]>

* Do not build cuda on MacOS

Signed-off-by: mudler <[email protected]>

* cleanup

Signed-off-by: Sertac Ozercan <[email protected]>

* Apply suggestions from code review

Signed-off-by: Ettore Di Giacinto <[email protected]>

---------

Signed-off-by: Sertac Ozercan <[email protected]>
Signed-off-by: Ettore Di Giacinto <[email protected]>
Signed-off-by: mudler <[email protected]>
Co-authored-by: Ettore Di Giacinto <[email protected]>
Co-authored-by: mudler <[email protected]>
  • Loading branch information
3 people authored May 14, 2024
1 parent 84e2407 commit a670318
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 58 deletions.
32 changes: 6 additions & 26 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: Build and Release

on:
on:
- push
- pull_request

Expand All @@ -16,15 +16,6 @@ concurrency:

jobs:
build-linux:
strategy:
matrix:
include:
- build: ''
defines: ''
- build: 'cuda12'
defines: ''
- build: 'cuda11'
defines: ''
runs-on: ubuntu-latest
steps:
- name: Clone
Expand All @@ -40,17 +31,13 @@ jobs:
sudo apt-get update
sudo apt-get install build-essential ffmpeg protobuf-compiler
- name: Install CUDA Dependencies
if: ${{ matrix.build == 'cuda12' || matrix.build == 'cuda11' }}
run: |
if [ "${{ matrix.build }}" == "cuda12" ]; then
export CUDA_VERSION=12-3
else
export CUDA_VERSION=11-7
fi
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
env:
CUDA_VERSION: 12-3
- name: Cache grpc
id: cache-grpc
uses: actions/cache@v4
Expand All @@ -69,22 +56,15 @@ jobs:
cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install
- name: Build
id: build
env:
BUILD_ID: "${{ matrix.build }}"
run: |
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
export PATH=$PATH:$GOPATH/bin
if [ "${{ matrix.build }}" == "cuda12" ] || [ "${{ matrix.build }}" == "cuda11" ]; then
export BUILD_TYPE=cublas
export PATH=/usr/local/cuda/bin:$PATH
make dist
else
STATIC=true make dist
fi
export PATH=/usr/local/cuda/bin:$PATH
make dist
- uses: actions/upload-artifact@v4
with:
name: LocalAI-linux-${{ matrix.build }}
name: LocalAI-linux
path: release/
- name: Release
uses: softprops/action-gh-release@v2
Expand Down
14 changes: 11 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ jobs:
df -h
- name: Clone
uses: actions/checkout@v4
with:
with:
submodules: true
- name: Setup Go ${{ matrix.go-version }}
uses: actions/setup-go@v5
Expand Down Expand Up @@ -87,6 +87,12 @@ jobs:
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
export CUDACXX=/usr/local/cuda/bin/nvcc
go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
Expand All @@ -102,6 +108,8 @@ jobs:
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
# Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
env:
CUDA_VERSION: 12-3
- name: Cache grpc
id: cache-grpc
uses: actions/cache@v4
Expand Down Expand Up @@ -166,7 +174,7 @@ jobs:
df -h
- name: Clone
uses: actions/checkout@v4
with:
with:
submodules: true
- name: Build images
run: |
Expand All @@ -192,7 +200,7 @@ jobs:
steps:
- name: Clone
uses: actions/checkout@v4
with:
with:
submodules: true
- name: Setup Go ${{ matrix.go-version }}
uses: actions/setup-go@v5
Expand Down
16 changes: 15 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,14 @@ build-minimal:
build-api:
BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build

dist: build
dist:
STATIC=true $(MAKE) backend-assets/grpc/llama-cpp-avx2
ifeq ($(OS),Darwin)
$(info ${GREEN}I Skip CUDA build on MacOS${RESET})
else
$(MAKE) backend-assets/grpc/llama-cpp-cuda
endif
$(MAKE) build
mkdir -p release
# if BUILD_ID is empty, then we don't append it to the binary name
ifeq ($(BUILD_ID),)
Expand Down Expand Up @@ -677,6 +684,13 @@ ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/
endif

# Build the CUDA variant of the llama.cpp gRPC server and install it as
# backend-assets/grpc/llama-cpp-cuda.
#
# Steps performed by the recipe:
#   1. Copy backend/cpp/llama to backend/cpp/llama-cuda.
#   2. Run the `purge` target inside that copy.
#   3. Invoke `build-llama-cpp-grpc-server` with VARIANT="llama-cuda" and
#      CMAKE_ARGS extended with -DLLAMA_CUDA=ON (AVX on; AVX2, AVX512, FMA
#      and F16C off).
#   4. Copy the resulting backend/cpp/llama-cuda/grpc-server binary to the
#      target path.
#
# NOTE(review): if backend/cpp/llama-cuda already exists when step 1 runs,
# `cp -rf` presumably nests the copy as backend/cpp/llama-cuda/llama instead
# of refreshing it — confirm whether a prior cleanup is expected.
backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-cuda
$(MAKE) -C backend/cpp/llama-cuda purge
$(info ${GREEN}I llama-cpp build info:cuda${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda

backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
Expand Down
16 changes: 0 additions & 16 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78 h1:w+iIsaOQNcT7O
github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78/go.mod h1:LmzpDX56iTiv29bbRTIsUNlaFfuhWRQBWjQdVyAevI8=
github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc=
github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE=
github.com/M0Rf30/go-tiny-dream v0.0.0-20231128165230-772a9c0d9aaf h1:UgjXLcE9I+VaVz7uBIlzAnyZIXwiDlIiTWqCh159aUI=
github.com/M0Rf30/go-tiny-dream v0.0.0-20231128165230-772a9c0d9aaf/go.mod h1:UOf2Mb/deUri5agct5OJ4SLWjhI+kZKbsUVUeRb24I0=
github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI=
github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU=
github.com/Masterminds/semver/v3 v3.2.0 h1:3MEsd0SM6jqZojhjLWWeBY+Kcjy9i6MQAeY7YgDP83g=
Expand Down Expand Up @@ -62,8 +60,6 @@ github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKoh
github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec=
github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw=
github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df h1:qVcBEZlvp5A1gGWNJj02xyDtbsUI2hohlQMSB1fgER4=
github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM=
github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 h1:iFaUwBSo5Svw6L7HYpRu/0lE3e0BaElwnNO1qkNQxBY=
github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s=
github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
Expand All @@ -73,8 +69,6 @@ github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMo
github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ=
github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230628193450-85ed71aaec8e h1:KtbU2JR3lJuXFASHG2+sVLucfMPBjWKUUKByX6C81mQ=
github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230628193450-85ed71aaec8e/go.mod h1:QIjZ9OktHFG7p+/m3sMvrAJKKdWrr1fZIK0rM6HZlyo=
github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
Expand All @@ -99,10 +93,6 @@ github.com/go-openapi/spec v0.21.0 h1:LTVzPc3p/RzRnkQqLRndbAzjY0d0BCL72A6j3CdL9Z
github.com/go-openapi/spec v0.21.0/go.mod h1:78u6VdPw81XU44qEWGhtr982gJ5BWg2c0I5XwVMotYk=
github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE=
github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ=
github.com/go-skynet/go-bert.cpp v0.0.0-20230716133540-6abe312cded1 h1:yXvc7QfGtoZ51tUW/YVjoTwAfh8HG88XU7UOrbNlz5Y=
github.com/go-skynet/go-bert.cpp v0.0.0-20230716133540-6abe312cded1/go.mod h1:fYjkCDRzC+oRLHSjQoajmYK6AmeJnmEanV27CClAcDc=
github.com/go-skynet/go-llama.cpp v0.0.0-20231009155254-aeba71ee8428 h1:WYjkXL0Nw7dN2uDBMVCWQ8xLavrIhjF/DLczuh5L9TY=
github.com/go-skynet/go-llama.cpp v0.0.0-20231009155254-aeba71ee8428/go.mod h1:iub0ugfTnflE3rcIuqV2pQSo15nEw3GLW/utm5gyERo=
github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI=
github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI=
github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE=
Expand Down Expand Up @@ -226,18 +216,12 @@ github.com/mitchellh/reflectwalk v1.0.0 h1:9D+8oIskB4VJBN5SFlmc27fSlIBZaov1Wpk/I
github.com/mitchellh/reflectwalk v1.0.0/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw=
github.com/moby/term v0.0.0-20201216013528-df9cb8a40635 h1:rzf0wL0CHVc8CEsgyygG0Mn9CNCCPZqOPaz8RiiHYQk=
github.com/moby/term v0.0.0-20201216013528-df9cb8a40635/go.mod h1:FBS0z0QWA44HXygs7VXDUOGoN/1TV3RuWkLO04am3wc=
github.com/mudler/go-piper v0.0.0-20230621222733-56b8a81b4760 h1:OFVkSxR7CRSRSNm5dvpMRZwmSwWa8EMMnHbc84fW5tU=
github.com/mudler/go-piper v0.0.0-20230621222733-56b8a81b4760/go.mod h1:O7SwdSWMilAWhBZMK9N9Y/oBDyMMzshE3ju8Xkexwig=
github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c h1:CI5uGwqBpN8N7BrSKC+nmdfw+9nPQIDyjHHlaIiitZI=
github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c/go.mod h1:gY3wyrhkRySJtmtI/JPt4a2mKv48h/M9pEZIW+SjeC0=
github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af h1:XFq6OUqsWQam0OrEr05okXsJK/TQur3zoZTHbiZD3Ks=
github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af/go.mod h1:8ufRkpz/S/9ahkaxzZ5i4WMgO9w4InEhuRoT7vK5Rnw=
github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s=
github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8=
github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo=
github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8=
github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530 h1:YXMxHwHMB9jCBo2Yu5gz3mTB3T1TnZs/HmPLv15LUSA=
github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI=
github.com/nwaples/rardecode v1.1.0 h1:vSxaY8vQhOcVr4mm5e8XllHWTiM4JF507A0Katqw7MQ=
github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A=
Expand Down
42 changes: 30 additions & 12 deletions pkg/model/initializers.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"time"

grpc "github.com/go-skynet/LocalAI/pkg/grpc"
"github.com/go-skynet/LocalAI/pkg/xsysinfo"
"github.com/phayes/freeport"
"github.com/rs/zerolog/log"
"golang.org/x/sys/cpu"
Expand All @@ -29,10 +30,12 @@ const (
LlamaGGML = "llama-ggml"

LLamaCPP = "llama-cpp"

LLamaCPPCUDA12 = "llama-cpp-cuda12"
LLamaCPPAVX2 = "llama-cpp-avx2"
LLamaCPPAVX = "llama-cpp-avx"
LLamaCPPFallback = "llama-cpp-fallback"
LLamaCPPCUDA = "llama-cpp-cuda"

Gpt4AllLlamaBackend = "gpt4all-llama"
Gpt4AllMptBackend = "gpt4all-mpt"
Expand Down Expand Up @@ -72,8 +75,7 @@ ENTRY:
}
}
if !e.IsDir() {
//backends = append(backends, e.Name())
if !strings.Contains(e.Name(), LLamaCPP) {
if !strings.Contains(e.Name(), LLamaCPP) || strings.Contains(e.Name(), LLamaCPPFallback) {
backends[e.Name()] = []string{}
}
}
Expand Down Expand Up @@ -104,7 +106,7 @@ ENTRY:
// First has more priority
priorityList := []string{
// First llama.cpp and llama-ggml
LLamaCPP, LlamaGGML, Gpt4All,
LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback,
}

toTheEnd := []string{
Expand Down Expand Up @@ -190,17 +192,33 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
} else {
grpcProcess := backendPath(o.assetDir, backend)

foundCUDA := false
// for llama-cpp, check CPU capabilities and load the appropriate variant
if backend == LLamaCPP {
if cpu.X86.HasAVX2 {
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2)
} else if cpu.X86.HasAVX {
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX)
} else {
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPFallback)
gpus, err := xsysinfo.GPUs()
if err == nil {
for _, gpu := range gpus {
if strings.Contains(gpu.String(), "nvidia") {
log.Info().Msgf("[%s] attempting to load with CUDA variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPCUDA)
if _, err := os.Stat(grpcProcess); err == nil {
foundCUDA = true
}
}
}
}

if !foundCUDA {
if cpu.X86.HasAVX2 {
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2)
} else if cpu.X86.HasAVX {
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPAVX)
} else {
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
grpcProcess = backendPath(o.assetDir, LLamaCPPFallback)
}
}
}

Expand Down

0 comments on commit a670318

Please sign in to comment.