From 3c3050f68e8c72769b3bd93fc556aa16ca7b1f9f Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Wed, 27 Nov 2024 16:34:28 +0100 Subject: [PATCH] feat(backends): Drop bert.cpp (#4272) * feat(backends): Drop bert.cpp use llama.cpp 3.2 as a drop-in replacement for bert.cpp Signed-off-by: Ettore Di Giacinto * chore(tests): make test more robust Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto --- Makefile | 30 +------------- aio/cpu/embeddings.yaml | 4 +- backend/go/llm/bert/bert.go | 34 --------------- backend/go/llm/bert/main.go | 21 ---------- core/gallery/models_test.go | 8 ++-- core/http/app_test.go | 24 ++++++----- docs/content/docs/features/embeddings.md | 46 +++++---------------- docs/content/docs/features/model-gallery.md | 2 +- embedded/models/bert-cpp.yaml | 23 ----------- gallery/bert-embeddings.yaml | 12 ------ gallery/index.yaml | 12 +++--- pkg/model/initializers.go | 3 -- tests/models_fixtures/embeddings.yaml | 5 +-- 13 files changed, 40 insertions(+), 184 deletions(-) delete mode 100644 backend/go/llm/bert/bert.go delete mode 100644 backend/go/llm/bert/main.go delete mode 100644 embedded/models/bert-cpp.yaml delete mode 100644 gallery/bert-embeddings.yaml diff --git a/Makefile b/Makefile index b53a070cce9b..83ce9661affa 100644 --- a/Makefile +++ b/Makefile @@ -14,10 +14,6 @@ CPPLLAMA_VERSION?=30ec39832165627dd6ed98938df63adfc6e6a21a WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d -# bert.cpp version -BERT_REPO?=https://github.com/go-skynet/go-bert.cpp -BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4 - # go-piper version PIPER_REPO?=https://github.com/mudler/go-piper PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0 @@ -198,7 +194,6 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts) endif ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface -ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx 
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback @@ -228,19 +223,6 @@ endif all: help -## BERT embeddings -sources/go-bert.cpp: - mkdir -p sources/go-bert.cpp - cd sources/go-bert.cpp && \ - git init && \ - git remote add origin $(BERT_REPO) && \ - git fetch origin && \ - git checkout $(BERT_VERSION) && \ - git submodule update --init --recursive --depth 1 --single-branch - -sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp - $(MAKE) -C sources/go-bert.cpp libgobert.a - ## go-llama.cpp sources/go-llama.cpp: mkdir -p sources/go-llama.cpp @@ -320,12 +302,11 @@ sources/whisper.cpp: sources/whisper.cpp/libwhisper.a: sources/whisper.cpp cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a -get-sources: sources/go-llama.cpp sources/go-piper sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp +get-sources: sources/go-llama.cpp sources/go-piper sources/whisper.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp replace: $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp $(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go - $(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp $(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream $(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper $(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion @@ -334,7 +315,6 @@ replace: dropreplace: $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp $(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go - $(GOCMD) mod edit -dropreplace github.com/go-skynet/go-bert.cpp $(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream $(GOCMD) mod edit 
-dropreplace github.com/mudler/go-piper $(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion @@ -349,7 +329,6 @@ rebuild: ## Rebuilds the project $(MAKE) -C sources/go-llama.cpp clean $(MAKE) -C sources/whisper.cpp clean $(MAKE) -C sources/go-stable-diffusion clean - $(MAKE) -C sources/go-bert.cpp clean $(MAKE) -C sources/go-piper clean $(MAKE) -C sources/go-tiny-dream clean $(MAKE) build @@ -707,13 +686,6 @@ backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_bindin backend-assets/grpc: protogen-go replace mkdir -p backend-assets/grpc -backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc - CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \ - $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/ -ifneq ($(UPX),) - $(UPX) backend-assets/grpc/bert-embeddings -endif - backend-assets/grpc/huggingface: backend-assets/grpc $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/ ifneq ($(UPX),) diff --git a/aio/cpu/embeddings.yaml b/aio/cpu/embeddings.yaml index 8576746f15f9..9aa845b0182c 100644 --- a/aio/cpu/embeddings.yaml +++ b/aio/cpu/embeddings.yaml @@ -1,7 +1,7 @@ name: text-embedding-ada-002 -backend: bert-embeddings +embeddings: true parameters: - model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin + model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf usage: | You can test this model with curl like this: diff --git a/backend/go/llm/bert/bert.go b/backend/go/llm/bert/bert.go deleted file mode 100644 index a6a1d1c58d6b..000000000000 --- a/backend/go/llm/bert/bert.go +++ /dev/null @@ -1,34 +0,0 @@ -package main - -// This is a wrapper to statisfy the GRPC service interface -// It is meant to be used by the main 
executable that is the server for the specific backend type (falcon, gpt3, etc) -import ( - bert "github.com/go-skynet/go-bert.cpp" - - "github.com/mudler/LocalAI/pkg/grpc/base" - pb "github.com/mudler/LocalAI/pkg/grpc/proto" -) - -type Embeddings struct { - base.SingleThread - bert *bert.Bert -} - -func (llm *Embeddings) Load(opts *pb.ModelOptions) error { - model, err := bert.New(opts.ModelFile) - llm.bert = model - return err -} - -func (llm *Embeddings) Embeddings(opts *pb.PredictOptions) ([]float32, error) { - - if len(opts.EmbeddingTokens) > 0 { - tokens := []int{} - for _, t := range opts.EmbeddingTokens { - tokens = append(tokens, int(t)) - } - return llm.bert.TokenEmbeddings(tokens, bert.SetThreads(int(opts.Threads))) - } - - return llm.bert.Embeddings(opts.Embeddings, bert.SetThreads(int(opts.Threads))) -} diff --git a/backend/go/llm/bert/main.go b/backend/go/llm/bert/main.go deleted file mode 100644 index 3a022f40d0c2..000000000000 --- a/backend/go/llm/bert/main.go +++ /dev/null @@ -1,21 +0,0 @@ -package main - -// Note: this is started internally by LocalAI and a server is allocated for each model - -import ( - "flag" - - grpc "github.com/mudler/LocalAI/pkg/grpc" -) - -var ( - addr = flag.String("addr", "localhost:50051", "the address to connect to") -) - -func main() { - flag.Parse() - - if err := grpc.StartServer(*addr, &Embeddings{}); err != nil { - panic(err) - } -} diff --git a/core/gallery/models_test.go b/core/gallery/models_test.go index 5217253fdcdd..6229c983c6c9 100644 --- a/core/gallery/models_test.go +++ b/core/gallery/models_test.go @@ -12,6 +12,8 @@ import ( "gopkg.in/yaml.v3" ) +const bertEmbeddingsURL = `https://gist.githubusercontent.com/mudler/0a080b166b87640e8644b09c2aee6e3b/raw/f0e8c26bb72edc16d9fbafbfd6638072126ff225/bert-embeddings-gallery.yaml` + var _ = Describe("Model test", func() { Context("Downloading", func() { @@ -47,7 +49,7 @@ var _ = Describe("Model test", func() { gallery := []GalleryModel{{ Name: "bert", - URL: 
"https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml", + URL: bertEmbeddingsURL, }} out, err := yaml.Marshal(gallery) Expect(err).ToNot(HaveOccurred()) @@ -66,7 +68,7 @@ var _ = Describe("Model test", func() { Expect(err).ToNot(HaveOccurred()) Expect(len(models)).To(Equal(1)) Expect(models[0].Name).To(Equal("bert")) - Expect(models[0].URL).To(Equal("https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml")) + Expect(models[0].URL).To(Equal(bertEmbeddingsURL)) Expect(models[0].Installed).To(BeFalse()) err = InstallModelFromGallery(galleries, "test@bert", tempdir, GalleryModel{}, func(s1, s2, s3 string, f float64) {}, true) @@ -78,7 +80,7 @@ var _ = Describe("Model test", func() { content := map[string]interface{}{} err = yaml.Unmarshal(dat, &content) Expect(err).ToNot(HaveOccurred()) - Expect(content["backend"]).To(Equal("bert-embeddings")) + Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this")) models, err = AvailableGalleryModels(galleries, tempdir) Expect(err).ToNot(HaveOccurred()) diff --git a/core/http/app_test.go b/core/http/app_test.go index e5431c50d514..28ed0ab9bd6a 100644 --- a/core/http/app_test.go +++ b/core/http/app_test.go @@ -240,6 +240,8 @@ func postInvalidRequest(url string) (error, int) { return nil, resp.StatusCode } +const bertEmbeddingsURL = `https://gist.githubusercontent.com/mudler/0a080b166b87640e8644b09c2aee6e3b/raw/f0e8c26bb72edc16d9fbafbfd6638072126ff225/bert-embeddings-gallery.yaml` + //go:embed backend-assets/* var backendAssets embed.FS @@ -279,13 +281,13 @@ var _ = Describe("API test", func() { g := []gallery.GalleryModel{ { Name: "bert", - URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml", + URL: bertEmbeddingsURL, }, { Name: "bert2", - URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml", + URL: bertEmbeddingsURL, Overrides: map[string]interface{}{"foo": 
"bar"}, - AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml"}}, + AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}}, }, } out, err := yaml.Marshal(g) @@ -383,7 +385,7 @@ var _ = Describe("API test", func() { content := map[string]interface{}{} err = yaml.Unmarshal(dat, &content) Expect(err).ToNot(HaveOccurred()) - Expect(content["backend"]).To(Equal("bert-embeddings")) + Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this")) Expect(content["foo"]).To(Equal("bar")) models, err = getModels("http://127.0.0.1:9090/models/available") @@ -402,7 +404,7 @@ var _ = Describe("API test", func() { It("overrides models", func() { response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{ - URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml", + URL: bertEmbeddingsURL, Name: "bert", Overrides: map[string]interface{}{ "backend": "llama", @@ -451,7 +453,7 @@ var _ = Describe("API test", func() { }) It("apply models without overrides", func() { response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{ - URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml", + URL: bertEmbeddingsURL, Name: "bert", Overrides: map[string]interface{}{}, }) @@ -471,7 +473,7 @@ var _ = Describe("API test", func() { content := map[string]interface{}{} err = yaml.Unmarshal(dat, &content) Expect(err).ToNot(HaveOccurred()) - Expect(content["backend"]).To(Equal("bert-embeddings")) + Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this")) }) It("runs openllama(llama-ggml backend)", Label("llama"), func() { @@ -806,7 +808,7 @@ var _ = Describe("API test", func() { It("returns the models list", func() { models, err := client.ListModels(context.TODO()) 
Expect(err).ToNot(HaveOccurred()) - Expect(len(models.Models)).To(Equal(6)) // If "config.yaml" should be included, this should be 8? + Expect(len(models.Models)).To(Equal(7)) // If "config.yaml" should be included, this should be 8? }) It("can generate completions via ggml", func() { resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel.ggml", Prompt: testPrompt}) @@ -866,8 +868,8 @@ var _ = Describe("API test", func() { }, ) Expect(err).ToNot(HaveOccurred(), err) - Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384)) - Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384)) + Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 2048)) + Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 2048)) sunEmbedding := resp.Data[0].Embedding resp2, err := client.CreateEmbeddings( @@ -951,7 +953,7 @@ var _ = Describe("API test", func() { openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}}) Expect(err).ToNot(HaveOccurred()) Expect(len(resp.Choices) > 0).To(BeTrue()) - Expect(strings.ToLower(resp.Choices[0].Message.Content)).To(Or(ContainSubstring("sure"), ContainSubstring("five"))) + Expect(strings.ToLower(resp.Choices[0].Message.Content)).To(Or(ContainSubstring("sure"), ContainSubstring("five"), ContainSubstring("5"))) stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}}) Expect(err).ToNot(HaveOccurred()) diff --git a/docs/content/docs/features/embeddings.md b/docs/content/docs/features/embeddings.md index ae8d2c786b9c..92c41eb64ef4 100644 --- a/docs/content/docs/features/embeddings.md +++ b/docs/content/docs/features/embeddings.md @@ -27,39 +27,6 @@ embeddings: true # .. 
other parameters ``` -## Bert embeddings - -To use `bert.cpp` models you can use the `bert` embedding backend. - -An example model config file: - -```yaml -name: text-embedding-ada-002 -parameters: - model: bert -backend: bert-embeddings -embeddings: true -# .. other parameters -``` - -The `bert` backend uses [bert.cpp](https://github.com/skeskinen/bert.cpp) and uses `ggml` models. - -For instance you can download the `ggml` quantized version of `all-MiniLM-L6-v2` from https://huggingface.co/skeskinen/ggml: - -```bash -wget https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-q4_0.bin -O models/bert -``` - -To test locally (LocalAI server running on `localhost`), -you can use `curl` (and `jq` at the end to prettify): - -```bash -curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{ - "input": "Your text string goes here", - "model": "text-embedding-ada-002" -}' | jq "." -``` - ## Huggingface embeddings To use `sentence-transformers` and models in `huggingface` you can use the `sentencetransformers` embedding backend. @@ -87,17 +54,26 @@ The `sentencetransformers` backend uses Python [sentence-transformers](https://g ## Llama.cpp embeddings -Embeddings with `llama.cpp` are supported with the `llama` backend. +Embeddings with `llama.cpp` are supported with the `llama-cpp` backend; it needs to be enabled by setting `embeddings` to `true`. ```yaml name: my-awesome-model -backend: llama +backend: llama-cpp embeddings: true parameters: model: ggml-file.bin # ... ``` +Then you can use the API to generate embeddings: + +```bash +curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{ + "input": "My text", + "model": "my-awesome-model" +}' | jq "." +``` + ## 💡 Examples - Example that uses LLamaIndex and LocalAI as embedding: [here](https://github.com/go-skynet/LocalAI/tree/master/examples/query_data/). 
diff --git a/docs/content/docs/features/model-gallery.md b/docs/content/docs/features/model-gallery.md index 5d2a9a8bab3a..c17a59469607 100644 --- a/docs/content/docs/features/model-gallery.md +++ b/docs/content/docs/features/model-gallery.md @@ -300,7 +300,7 @@ curl $LOCALAI/models/apply -H "Content-Type: application/json" -d '{ ```bash curl $LOCALAI/models/apply -H "Content-Type: application/json" -d '{ - "url": "github:mudler/LocalAI/gallery/bert-embeddings.yaml", + "id": "bert-embeddings", "name": "text-embedding-ada-002" }' ``` diff --git a/embedded/models/bert-cpp.yaml b/embedded/models/bert-cpp.yaml deleted file mode 100644 index 63d3c7b645b8..000000000000 --- a/embedded/models/bert-cpp.yaml +++ /dev/null @@ -1,23 +0,0 @@ -backend: bert-embeddings -embeddings: true -f16: true - -gpu_layers: 90 -mmap: true -name: bert-cpp-minilm-v6 - -parameters: - model: bert-MiniLM-L6-v2q4_0.bin - -download_files: -- filename: "bert-MiniLM-L6-v2q4_0.bin" - sha256: "a5a174d8772c8a569faf9f3136c441f2c3855b5bf35ed32274294219533feaad" - uri: "https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin" - -usage: | - You can test this model with curl like this: - - curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{ - "input": "Your text string goes here", - "model": "bert-cpp-minilm-v6" - }' \ No newline at end of file diff --git a/gallery/bert-embeddings.yaml b/gallery/bert-embeddings.yaml deleted file mode 100644 index 7ce61799d3cb..000000000000 --- a/gallery/bert-embeddings.yaml +++ /dev/null @@ -1,12 +0,0 @@ ---- -name: "bert-embeddings" - -config_file: | - parameters: - model: bert-MiniLM-L6-v2q4_0.bin - backend: bert-embeddings - embeddings: true -files: - - filename: "bert-MiniLM-L6-v2q4_0.bin" - sha256: "a5a174d8772c8a569faf9f3136c441f2c3855b5bf35ed32274294219533feaad" - uri: "https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin" diff --git a/gallery/index.yaml b/gallery/index.yaml index 
bdd952c82750..64b68a5546d1 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -380,6 +380,7 @@ urls: - https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF overrides: + embeddings: true parameters: model: llama-3.2-1b-instruct-q4_k_m.gguf files: @@ -8732,16 +8733,13 @@ - filename: "ggml-model-whisper-tiny.en-q8_0.bin" uri: "https://ggml.ggerganov.com/ggml-model-whisper-tiny.en-q8_0.bin" sha256: 5bc2b3860aa151a4c6e7bb095e1fcce7cf12c7b020ca08dcec0c6d018bb7dd94 -## Bert embeddings -- url: "github:mudler/LocalAI/gallery/bert-embeddings.yaml@master" +## Bert embeddings (llama3.2 drop-in) +- !!merge <<: *llama32 name: "bert-embeddings" - license: "Apache 2.0" - urls: - - https://huggingface.co/skeskinen/ggml + description: | + llama3.2 embeddings model. Using as drop-in replacement for bert-embeddings tags: - embeddings - description: | - Bert model that can be used for embeddings ## Stable Diffusion - url: github:mudler/LocalAI/gallery/stablediffusion.yaml@master license: "BSD-3" diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index dc8e84f7cf6e..3d03514a17ff 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -45,7 +45,6 @@ const ( LLamaCPPGRPC = "llama-cpp-grpc" - BertEmbeddingsBackend = "bert-embeddings" WhisperBackend = "whisper" StableDiffusionBackend = "stablediffusion" TinyDreamBackend = "tinydream" @@ -154,8 +153,6 @@ func orderBackends(backends map[string][]string) ([]string, error) { toTheEnd := []string{ // last has to be huggingface LCHuggingFaceBackend, - // then bert embeddings - BertEmbeddingsBackend, } // create an ordered map diff --git a/tests/models_fixtures/embeddings.yaml b/tests/models_fixtures/embeddings.yaml index 46a08502f179..76c4a56add50 100644 --- a/tests/models_fixtures/embeddings.yaml +++ b/tests/models_fixtures/embeddings.yaml @@ -1,5 +1,4 @@ name: text-embedding-ada-002 -parameters: - model: bert -backend: bert-embeddings embeddings: true +parameters: + model: 
huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf \ No newline at end of file