From e6325a6b27cd6c7bf5c5d0cea12e9d2990844b18 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Wed, 5 Jun 2024 22:30:49 +0200
Subject: [PATCH 1/3] wip: try to build also intel variants

Signed-off-by: Ettore Di Giacinto
---
 Makefile | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/Makefile b/Makefile
index 4c4902555121..c68cf07908ea 100644
--- a/Makefile
+++ b/Makefile
@@ -720,6 +720,20 @@ backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
 	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
 
+backend-assets/grpc/llama-cpp-sycl-f16: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
+	$(MAKE) -C backend/cpp/llama-sycl_f16 purge
+	$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
+	BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16
+
+backend-assets/grpc/llama-cpp-sycl-f32: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
+	$(MAKE) -C backend/cpp/llama-sycl_f32 purge
+	$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
+	BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32
+
 backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-grpc
 	$(MAKE) -C backend/cpp/llama-grpc purge

From 9be16b5bd67931a38ed97070d84bc5c520e7c60b Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Wed, 5 Jun 2024 23:59:50 +0200
Subject: [PATCH 2/3] Add dependencies

---
 .github/workflows/release.yaml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index aae9c6b5bfa1..aadd868542be 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -70,6 +70,12 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
+      - name: Intel Dependencies
+        run: |
+          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+          echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+          sudo apt update
+          sudo apt install -y intel-basekit
       - name: Install CUDA Dependencies
         run: |
           curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -127,6 +133,7 @@ jobs:
           export PATH=$PATH:$GOPATH/bin
           export PATH=/usr/local/cuda/bin:$PATH
           export PATH=/opt/rocm/bin:$PATH
+          source /opt/intel/oneapi/setvars.sh
           GO_TAGS=p2p make dist
       - uses: actions/upload-artifact@v4
         with:

From 3b7419c929c3c72208ff7d4adc16363695e33244 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Thu, 6 Jun 2024 00:00:06 +0200
Subject: [PATCH 3/3] Select automatically intel backend

Build the SYCL f16/f32 llama.cpp variants as part of `make dist` and pick
one of them at runtime when an Intel GPU is reported, choosing f16 or f32
from the model's F16Memory option.

The LLamaCPPSycl16/LLamaCPPSycl32 constants must equal the file names the
Makefile copies into backend-assets/grpc (llama-cpp-sycl_f16 and
llama-cpp-sycl_f32): both the asset directory scan and the os.Stat probe
in selectGRPCProcess look the binaries up by these names.

---
 Makefile                  |  6 ++++--
 pkg/model/initializers.go | 37 ++++++++++++++++++++++++++++++++-----
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index c68cf07908ea..665020187cbc 100644
--- a/Makefile
+++ b/Makefile
@@ -328,6 +328,8 @@ ifeq ($(OS),Darwin)
 else
 	$(MAKE) backend-assets/grpc/llama-cpp-cuda
 	$(MAKE) backend-assets/grpc/llama-cpp-hipblas
+	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
+	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
 endif
 	$(MAKE) build
 	mkdir -p release
@@ -720,14 +722,14 @@ backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
 	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
 
-backend-assets/grpc/llama-cpp-sycl-f16: backend-assets/grpc
+backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
 	$(MAKE) -C backend/cpp/llama-sycl_f16 purge
 	$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
 	BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16
 
-backend-assets/grpc/llama-cpp-sycl-f32: backend-assets/grpc
+backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
 	$(MAKE) -C backend/cpp/llama-sycl_f32 purge
 	$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index ec58c279e355..7572735e32bb 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -38,7 +38,10 @@ const (
 	LLamaCPPFallback = "llama-cpp-fallback"
 	LLamaCPPCUDA     = "llama-cpp-cuda"
 	LLamaCPPHipblas  = "llama-cpp-hipblas"
-	LLamaCPPGRPC     = "llama-cpp-grpc"
+	LLamaCPPSycl16   = "llama-cpp-sycl_f16"
+	LLamaCPPSycl32   = "llama-cpp-sycl_f32"
+
+	LLamaCPPGRPC = "llama-cpp-grpc"
 
 	Gpt4AllLlamaBackend = "gpt4all-llama"
 	Gpt4AllMptBackend   = "gpt4all-mpt"
@@ -94,7 +97,7 @@ ENTRY:
 	if autoDetect {
 		// if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up
 		// when starting the service
-		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas := false, false, false, false, false, false
+		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
 		if _, ok := backends[LLamaCPP]; !ok {
 			for _, e := range entry {
 				if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
@@ -121,6 +124,14 @@ ENTRY:
 					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPHipblas)
 					foundLCPPHipblas = true
 				}
+				if strings.Contains(e.Name(), LLamaCPPSycl16) && !foundSycl16 {
+					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl16)
+					foundSycl16 = true
+				}
+				if strings.Contains(e.Name(), LLamaCPPSycl32) && !foundSycl32 {
+					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl32)
+					foundSycl32 = true
+				}
 			}
 		}
 	}
@@ -172,9 +183,10 @@ ENTRY:
 }
 
 // selectGRPCProcess selects the GRPC process to start based on system capabilities
-func selectGRPCProcess(backend, assetDir string) string {
+func selectGRPCProcess(backend, assetDir string, f16 bool) string {
 	foundCUDA := false
 	foundAMDGPU := false
+	foundIntelGPU := false
 	var grpcProcess string
 
 	// Select backend now just for llama.cpp
@@ -211,10 +223,24 @@ func selectGRPCProcess(backend, assetDir string) string {
 					log.Info().Msgf("GPU device found but no HIPBLAS backend present")
 				}
 			}
+			if strings.Contains(gpu.String(), "intel") {
+				backend := LLamaCPPSycl16
+				if !f16 {
+					backend = LLamaCPPSycl32
+				}
+				p := backendPath(assetDir, backend)
+				if _, err := os.Stat(p); err == nil {
+					log.Info().Msgf("[%s] attempting to load with Intel variant", backend)
+					grpcProcess = p
+					foundIntelGPU = true
+				} else {
+					log.Info().Msgf("GPU device found but no Intel backend present")
+				}
+			}
 		}
 	}
 
-	if foundCUDA || foundAMDGPU {
+	if foundCUDA || foundAMDGPU || foundIntelGPU {
 		return grpcProcess
 	}
 
@@ -236,6 +262,7 @@ func selectGRPCProcess(backend, assetDir string) string {
 // It also loads the model
 func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) {
 	return func(modelName, modelFile string) (ModelAddress, error) {
+
 		log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelName, modelFile, backend, *o)
 
 		var client ModelAddress
@@ -284,7 +311,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 
 			if autoDetect {
 				// autoDetect GRPC process to start based on system capabilities
-				if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" {
+				if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
 					grpcProcess = selectedProcess
 				}
 			}