diff --git a/Makefile b/Makefile
index c68cf07908ea..aca7bb3f520e 100644
--- a/Makefile
+++ b/Makefile
@@ -328,6 +328,8 @@ ifeq ($(OS),Darwin)
 else
 	$(MAKE) backend-assets/grpc/llama-cpp-cuda
 	$(MAKE) backend-assets/grpc/llama-cpp-hipblas
+	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
+	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
 endif
 	$(MAKE) build
 	mkdir -p release
diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go
index ec58c279e355..7572735e32bb 100644
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -38,7 +38,10 @@ const (
 	LLamaCPPFallback = "llama-cpp-fallback"
 	LLamaCPPCUDA     = "llama-cpp-cuda"
 	LLamaCPPHipblas  = "llama-cpp-hipblas"
-	LLamaCPPGRPC     = "llama-cpp-grpc"
+	LLamaCPPSycl16   = "llama-cpp-sycl_f16"
+	LLamaCPPSycl32   = "llama-cpp-sycl_f32"
+
+	LLamaCPPGRPC = "llama-cpp-grpc"
 
 	Gpt4AllLlamaBackend = "gpt4all-llama"
 	Gpt4AllMptBackend   = "gpt4all-mpt"
@@ -94,7 +97,7 @@ ENTRY:
 	if autoDetect {
 		// if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up
 		// when starting the service
-		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas := false, false, false, false, false, false
+		foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
 		if _, ok := backends[LLamaCPP]; !ok {
 			for _, e := range entry {
 				if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
@@ -121,6 +124,14 @@ ENTRY:
 					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPHipblas)
 					foundLCPPHipblas = true
 				}
+				if strings.Contains(e.Name(), LLamaCPPSycl16) && !foundSycl16 {
+					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl16)
+					foundSycl16 = true
+				}
+				if strings.Contains(e.Name(), LLamaCPPSycl32) && !foundSycl32 {
+					backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl32)
+					foundSycl32 = true
+				}
 			}
 		}
 	}
@@ -172,9 +183,10 @@ ENTRY:
 }
 
 // selectGRPCProcess selects the GRPC process to start based on system capabilities
-func selectGRPCProcess(backend, assetDir string) string {
+func selectGRPCProcess(backend, assetDir string, f16 bool) string {
 	foundCUDA := false
 	foundAMDGPU := false
+	foundIntelGPU := false
 	var grpcProcess string
 
 	// Select backend now just for llama.cpp
@@ -211,10 +223,24 @@ func selectGRPCProcess(backend, assetDir string) string {
 				log.Info().Msgf("GPU device found but no HIPBLAS backend present")
 			}
 		}
+		if strings.Contains(gpu.String(), "intel") {
+			backend := LLamaCPPSycl16
+			if !f16 {
+				backend = LLamaCPPSycl32
+			}
+			p := backendPath(assetDir, backend)
+			if _, err := os.Stat(p); err == nil {
+				log.Info().Msgf("[%s] attempting to load with Intel variant", backend)
+				grpcProcess = p
+				foundIntelGPU = true
+			} else {
+				log.Info().Msgf("GPU device found but no Intel backend present")
+			}
+		}
 	}
 
-	if foundCUDA || foundAMDGPU {
+	if foundCUDA || foundAMDGPU || foundIntelGPU {
 		return grpcProcess
 	}
@@ -236,6 +262,7 @@ func selectGRPCProcess(backend, assetDir string) string {
 // It also loads the model
 func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) {
 	return func(modelName, modelFile string) (ModelAddress, error) {
+		log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelName, modelFile, backend, *o)
 
 		var client ModelAddress
 
@@ -284,7 +311,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
 		if autoDetect {
 			// autoDetect GRPC process to start based on system capabilities
-			if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" {
+			if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
 				grpcProcess = selectedProcess
 			}
 		}
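
For readers skimming the patch, the heart of the change is the new Intel branch in selectGRPCProcess: GPU vendor strings are matched with substring checks, and for Intel devices the new f16 argument (fed from the model's F16Memory gRPC option at the call site) picks between the sycl_f16 and sycl_f32 builds that the Makefile now produces. The sketch below condenses that flow into one standalone function. It is illustrative only: pickVariant, the gpuVendors parameter, and the hard-coded directory layout are assumptions of this sketch (the real code iterates detected GPU descriptors and calls backendPath), not names from the patch.

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

// Variant binary names, matching the constants added in the patch.
const (
	llamaCPPCUDA     = "llama-cpp-cuda"
	llamaCPPHipblas  = "llama-cpp-hipblas"
	llamaCPPSycl16   = "llama-cpp-sycl_f16"
	llamaCPPSycl32   = "llama-cpp-sycl_f32"
	llamaCPPFallback = "llama-cpp-fallback"
)

// pickVariant is a hypothetical, condensed stand-in for the selection
// logic in selectGRPCProcess. Vendors are matched the same way the patch
// does (substring checks); Intel additionally honors the f16 flag to
// choose between the sycl_f16 and sycl_f32 builds. A variant is only
// selected if its binary actually exists under assetDir.
func pickVariant(gpuVendors []string, assetDir string, f16 bool) string {
	for _, vendor := range gpuVendors {
		v := strings.ToLower(vendor)
		var candidate string
		switch {
		case strings.Contains(v, "nvidia"):
			candidate = llamaCPPCUDA
		case strings.Contains(v, "amd"):
			candidate = llamaCPPHipblas
		case strings.Contains(v, "intel"):
			candidate = llamaCPPSycl32
			if f16 {
				candidate = llamaCPPSycl16
			}
		default:
			continue // no dedicated build for this vendor
		}
		// Assumed layout: backendPath(assetDir, name) resolves to
		// <assetDir>/backend-assets/grpc/<name>.
		p := filepath.Join(assetDir, "backend-assets", "grpc", candidate)
		if _, err := os.Stat(p); err == nil {
			return p // GPU-specific build present on disk: use it
		}
	}
	// No usable GPU build: fall back to the generic CPU binary.
	return filepath.Join(assetDir, "backend-assets", "grpc", llamaCPPFallback)
}

func main() {
	// An Intel GPU with f16 enabled selects llama-cpp-sycl_f16 if that
	// binary was built; otherwise the fallback path is returned.
	fmt.Println(pickVariant([]string{"Intel Corporation"}, "/opt/local-ai", true))
}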
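
A note on the selection order implied by the patch: the vendor checks inside the GPU loop are independent if blocks rather than an if/else chain, so with multiple GPUs present, whichever matching branch runs last overwrites grpcProcess before the combined foundCUDA || foundAMDGPU || foundIntelGPU guard returns it. If none of the GPU-specific binaries exist on disk, the function falls through to its remaining capability checks (not shown in these hunks), which the sketch above mirrors with its final fallback return.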