Skip to content

Commit

Permalink
build(intel): bundle intel variants in single-binary (#2494)
Browse files Browse the repository at this point in the history
* wip: try to build also intel variants

Signed-off-by: Ettore Di Giacinto <[email protected]>

* Add dependencies

* Select automatically intel backend

---------

Signed-off-by: Ettore Di Giacinto <[email protected]>
  • Loading branch information
mudler authored Jun 6, 2024
1 parent a293aa1 commit 596cf76
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 5 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@ jobs:
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
# Register Intel's oneAPI apt repository (signing key + source list) and
# install the intel-basekit package from it. apt-get is used instead of apt,
# matching the base dependency step; apt's CLI is not guaranteed to be stable
# for use in scripts.
- name: Intel Dependencies
run: |
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
sudo apt-get update
sudo apt-get install -y intel-basekit
- name: Install CUDA Dependencies
run: |
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
Expand Down Expand Up @@ -127,6 +133,7 @@ jobs:
export PATH=$PATH:$GOPATH/bin
export PATH=/usr/local/cuda/bin:$PATH
export PATH=/opt/rocm/bin:$PATH
source /opt/intel/oneapi/setvars.sh
GO_TAGS=p2p make dist
- uses: actions/upload-artifact@v4
with:
Expand Down
16 changes: 16 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,8 @@ ifeq ($(OS),Darwin)
else
$(MAKE) backend-assets/grpc/llama-cpp-cuda
$(MAKE) backend-assets/grpc/llama-cpp-hipblas
$(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
$(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
endif
$(MAKE) build
mkdir -p release
Expand Down Expand Up @@ -720,6 +722,20 @@ backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas

# Build the llama.cpp gRPC server with BUILD_TYPE=sycl_f16.
# Steps: copy backend/cpp/llama to backend/cpp/llama-sycl_f16, run that copy's
# `purge` target, build it via build-llama-cpp-grpc-server with
# VARIANT=llama-sycl_f16, then copy the resulting grpc-server binary to
# backend-assets/grpc/llama-cpp-sycl_f16.
backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
$(MAKE) -C backend/cpp/llama-sycl_f16 purge
$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16

# Build the llama.cpp gRPC server with BUILD_TYPE=sycl_f32.
# Steps: copy backend/cpp/llama to backend/cpp/llama-sycl_f32, run that copy's
# `purge` target, build it via build-llama-cpp-grpc-server with
# VARIANT=llama-sycl_f32, then copy the resulting grpc-server binary to
# backend-assets/grpc/llama-cpp-sycl_f32.
backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
$(MAKE) -C backend/cpp/llama-sycl_f32 purge
$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32

backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-grpc
$(MAKE) -C backend/cpp/llama-grpc purge
Expand Down
37 changes: 32 additions & 5 deletions pkg/model/initializers.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ const (
LLamaCPPFallback = "llama-cpp-fallback"
LLamaCPPCUDA = "llama-cpp-cuda"
LLamaCPPHipblas = "llama-cpp-hipblas"
LLamaCPPGRPC = "llama-cpp-grpc"
// The SYCL variant names must match the binary file names produced by the
// Makefile targets backend-assets/grpc/llama-cpp-sycl_f16 and
// backend-assets/grpc/llama-cpp-sycl_f32; they are used both to discover the
// variants in the asset directory and to build the path of the process to start.
LLamaCPPSycl16 = "llama-cpp-sycl_f16"
LLamaCPPSycl32 = "llama-cpp-sycl_f32"

LLamaCPPGRPC = "llama-cpp-grpc"

Gpt4AllLlamaBackend = "gpt4all-llama"
Gpt4AllMptBackend = "gpt4all-mpt"
Expand Down Expand Up @@ -94,7 +97,7 @@ ENTRY:
if autoDetect {
// if we find the llama.cpp variants, show them as a single backend (llama-cpp), since the concrete variant
// is picked later, when starting the service
foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas := false, false, false, false, false, false
foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
if _, ok := backends[LLamaCPP]; !ok {
for _, e := range entry {
if strings.Contains(e.Name(), LLamaCPPAVX2) && !foundLCPPAVX2 {
Expand All @@ -121,6 +124,14 @@ ENTRY:
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPHipblas)
foundLCPPHipblas = true
}
if strings.Contains(e.Name(), LLamaCPPSycl16) && !foundSycl16 {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl16)
foundSycl16 = true
}
if strings.Contains(e.Name(), LLamaCPPSycl32) && !foundSycl32 {
backends[LLamaCPP] = append(backends[LLamaCPP], LLamaCPPSycl32)
foundSycl32 = true
}
}
}
}
Expand Down Expand Up @@ -172,9 +183,10 @@ ENTRY:
}

// selectGRPCProcess selects the GRPC process to start based on system capabilities
func selectGRPCProcess(backend, assetDir string) string {
func selectGRPCProcess(backend, assetDir string, f16 bool) string {
foundCUDA := false
foundAMDGPU := false
foundIntelGPU := false
var grpcProcess string

// Select backend now just for llama.cpp
Expand Down Expand Up @@ -211,10 +223,24 @@ func selectGRPCProcess(backend, assetDir string) string {
log.Info().Msgf("GPU device found but no HIPBLAS backend present")
}
}
if strings.Contains(gpu.String(), "intel") {
backend := LLamaCPPSycl16
if !f16 {
backend = LLamaCPPSycl32
}
p := backendPath(assetDir, backend)
if _, err := os.Stat(p); err == nil {
log.Info().Msgf("[%s] attempting to load with Intel variant", backend)
grpcProcess = p
foundIntelGPU = true
} else {
log.Info().Msgf("GPU device found but no Intel backend present")
}
}
}
}

if foundCUDA || foundAMDGPU {
if foundCUDA || foundAMDGPU || foundIntelGPU {
return grpcProcess
}

Expand All @@ -236,6 +262,7 @@ func selectGRPCProcess(backend, assetDir string) string {
// It also loads the model
func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (ModelAddress, error) {
return func(modelName, modelFile string) (ModelAddress, error) {

log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelName, modelFile, backend, *o)

var client ModelAddress
Expand Down Expand Up @@ -284,7 +311,7 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string

if autoDetect {
// autoDetect GRPC process to start based on system capabilities
if selectedProcess := selectGRPCProcess(backend, o.assetDir); selectedProcess != "" {
if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
grpcProcess = selectedProcess
}
}
Expand Down

0 comments on commit 596cf76

Please sign in to comment.