Merge branch 'master' into test-authv2

mudler · Sep 19, 2024 · 8f3849a · 8f3849a
2 parents 0656fab + 191bc2e
commit 8f3849a
Show file tree

Hide file tree

Showing 14 changed files with 75 additions and 33 deletions.
diff --git a/.github/workflows/secscan.yaml b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
         if: ${{ github.actor != 'dependabot[bot]' }}
       - name: Run Gosec Security Scanner
         if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.21.2
+        uses: securego/gosec@v2.21.0
         with:
           # we let the report content trigger a failure using the GitHub Security features.
           args: '-no-fail -fmt sarif -out results.sarif ./...'

diff --git a/Makefile b/Makefile
@@ -8,15 +8,15 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=23e0d70bacaaca1429d365a44aa9e7434f17823b
+CPPLLAMA_VERSION?=64c6af3195c3cd4aa3328a1282d29cd2635c34c9
 
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=049b3a0e53c8a8e4c4576c06a1a4fccf0063a73f
+WHISPER_CPP_VERSION?=5b1ce40fa882e9cb8630b48032067a1ed2f1534f
 
 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp

diff --git a/aio/cpu/vision.yaml b/aio/cpu/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4-vision-preview
+name: gpt-4o
 
 roles:
   user: "USER:"

diff --git a/aio/gpu-8g/vision.yaml b/aio/gpu-8g/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4-vision-preview
+name: gpt-4o
 
 roles:
   user: "USER:"

diff --git a/aio/intel/vision.yaml b/aio/intel/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 mmap: false
 f16: false
-name: gpt-4-vision-preview
+name: gpt-4o
 
 roles:
   user: "USER:"

diff --git a/backend/backend.proto b/backend/backend.proto
@@ -134,6 +134,8 @@ message PredictOptions {
   repeated string Images = 42;
   bool UseTokenizerTemplate = 43;
   repeated Message Messages = 44;
+  repeated string Videos = 45;
+  repeated string Audios = 46;
 }
 
 // The response message containing the result

diff --git a/core/backend/llm.go b/core/backend/llm.go
@@ -31,7 +31,7 @@ type TokenUsage struct {
 	Completion int
 }
 
-func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
 	threads := c.Threads
 	if *threads == 0 && o.Threads != 0 {
@@ -101,6 +101,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 		opts.Messages = protoMessages
 		opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
 		opts.Images = images
+		opts.Videos = videos
+		opts.Audios = audios
 
 		tokenUsage := TokenUsage{}
 

diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
@@ -640,8 +640,16 @@ func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m
 	for _, m := range input.Messages {
 		images = append(images, m.StringImages...)
 	}
+	videos := []string{}
+	for _, m := range input.Messages {
+		videos = append(videos, m.StringVideos...)
+	}
+	audios := []string{}
+	for _, m := range input.Messages {
+		audios = append(audios, m.StringAudios...)
+	}
 
-	predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, ml, *config, o, nil)
+	predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, *config, o, nil)
 	if err != nil {
 		log.Error().Err(err).Msg("model inference failed")
 		return "", err

diff --git a/core/http/endpoints/openai/inference.go b/core/http/endpoints/openai/inference.go
@@ -27,9 +27,17 @@ func ComputeChoices(
 	for _, m := range req.Messages {
 		images = append(images, m.StringImages...)
 	}
+	videos := []string{}
+	for _, m := range req.Messages {
+		videos = append(videos, m.StringVideos...)
+	}
+	audios := []string{}
+	for _, m := range req.Messages {
+		audios = append(audios, m.StringAudios...)
+	}
 
 	// get the model function to call for the result
-	predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, loader, *config, o, tokenCallback)
+	predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, audios, loader, *config, o, tokenCallback)
 	if err != nil {
 		return result, backend.TokenUsage{}, err
 	}

diff --git a/core/http/endpoints/openai/request.go b/core/http/endpoints/openai/request.go
@@ -135,7 +135,7 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 	}
 
 	// Decode each request's message content
-	index := 0
+	imgIndex, vidIndex, audioIndex := 0, 0, 0
 	for i, m := range input.Messages {
 		switch content := m.Content.(type) {
 		case string:
@@ -144,20 +144,44 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
 			dat, _ := json.Marshal(content)
 			c := []schema.Content{}
 			json.Unmarshal(dat, &c)
+		CONTENT:
 			for _, pp := range c {
-				if pp.Type == "text" {
+				switch pp.Type {
+				case "text":
 					input.Messages[i].StringContent = pp.Text
-				} else if pp.Type == "image_url" {
-					// Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64:
-					base64, err := utils.GetImageURLAsBase64(pp.ImageURL.URL)
-					if err == nil {
-						input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
-						// set a placeholder for each image
-						input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent
-						index++
-					} else {
+				case "video", "video_url":
+					// Decode content as base64 either if it's an URL or base64 text
+					base64, err := utils.GetContentURIAsBase64(pp.VideoURL.URL)
+					if err != nil {
+						log.Error().Msgf("Failed encoding video: %s", err)
+						continue CONTENT
+					}
+					input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff
+					// set a placeholder for each video
+					input.Messages[i].StringContent = fmt.Sprintf("[vid-%d]", vidIndex) + input.Messages[i].StringContent
+					vidIndex++
+				case "audio_url", "audio":
+					// Decode content as base64 either if it's an URL or base64 text
+					base64, err := utils.GetContentURIAsBase64(pp.AudioURL.URL)
+					if err != nil {
+						log.Error().Msgf("Failed encoding image: %s", err)
+						continue CONTENT
+					}
+					input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff
+					// set a placeholder for each audio
+					input.Messages[i].StringContent = fmt.Sprintf("[audio-%d]", audioIndex) + input.Messages[i].StringContent
+					audioIndex++
+				case "image_url", "image":
+					// Decode content as base64 either if it's an URL or base64 text
+					base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL)
+					if err != nil {
 						log.Error().Msgf("Failed encoding image: %s", err)
+						continue CONTENT
 					}
+					input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
+					// set a placeholder for each image
+					input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", imgIndex) + input.Messages[i].StringContent
+					imgIndex++
 				}
 			}
 		}

diff --git a/core/schema/openai.go b/core/schema/openai.go
@@ -58,6 +58,8 @@ type Content struct {
 	Type     string     `json:"type" yaml:"type"`
 	Text     string     `json:"text" yaml:"text"`
 	ImageURL ContentURL `json:"image_url" yaml:"image_url"`
+	AudioURL ContentURL `json:"audio_url" yaml:"audio_url"`
+	VideoURL ContentURL `json:"video_url" yaml:"video_url"`
 }
 
 type ContentURL struct {
@@ -76,6 +78,8 @@ type Message struct {
 
 	StringContent string   `json:"string_content,omitempty" yaml:"string_content,omitempty"`
 	StringImages  []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
+	StringVideos  []string `json:"string_videos,omitempty" yaml:"string_videos,omitempty"`
+	StringAudios  []string `json:"string_audios,omitempty" yaml:"string_audios,omitempty"`
 
 	// A result of a function call
 	FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`

diff --git a/pkg/utils/base64.go b/pkg/utils/base64.go
@@ -13,14 +13,8 @@ var base64DownloadClient http.Client = http.Client{
 	Timeout: 30 * time.Second,
 }
 
-// this function check if the string is an URL, if it's an URL downloads the image in memory
-// encodes it in base64 and returns the base64 string
-
-// This may look weird down in pkg/utils while it is currently only used in core/config
-//
-//	but I believe it may be useful for MQTT as well in the near future, so I'm
-//	extracting it while I'm thinking of it.
-func GetImageURLAsBase64(s string) (string, error) {
+// GetContentURIAsBase64 checks whether the string is a URL: if it is, it downloads the content into memory, encodes it in base64, and returns the base64 string; otherwise it returns the string with the base64 data header stripped.
+func GetContentURIAsBase64(s string) (string, error) {
 	if strings.HasPrefix(s, "http") {
 		// download the image
 		resp, err := base64DownloadClient.Get(s)

diff --git a/pkg/utils/base64_test.go b/pkg/utils/base64_test.go
@@ -10,28 +10,28 @@ var _ = Describe("utils/base64 tests", func() {
 	It("GetImageURLAsBase64 can strip jpeg data url prefixes", func() {
 		// This one doesn't actually _care_ that it's base64, so feed "bad" data in this test in order to catch a change in that behavior for informational purposes.
 		input := "data:image/jpeg;base64,FOO"
-		b64, err := GetImageURLAsBase64(input)
+		b64, err := GetContentURIAsBase64(input)
 		Expect(err).To(BeNil())
 		Expect(b64).To(Equal("FOO"))
 	})
 	It("GetImageURLAsBase64 can strip png data url prefixes", func() {
 		// This one doesn't actually _care_ that it's base64, so feed "bad" data in this test in order to catch a change in that behavior for informational purposes.
 		input := "data:image/png;base64,BAR"
-		b64, err := GetImageURLAsBase64(input)
+		b64, err := GetContentURIAsBase64(input)
 		Expect(err).To(BeNil())
 		Expect(b64).To(Equal("BAR"))
 	})
 	It("GetImageURLAsBase64 returns an error for bogus data", func() {
 		input := "FOO"
-		b64, err := GetImageURLAsBase64(input)
+		b64, err := GetContentURIAsBase64(input)
 		Expect(b64).To(Equal(""))
 		Expect(err).ToNot(BeNil())
 		Expect(err).To(MatchError("not valid string"))
 	})
 	It("GetImageURLAsBase64 can actually download images and calculates something", func() {
 		// This test doesn't actually _check_ the results at this time, which is bad, but there wasn't a test at all before...
 		input := "https://upload.wikimedia.org/wikipedia/en/2/29/Wargames.jpg"
-		b64, err := GetImageURLAsBase64(input)
+		b64, err := GetContentURIAsBase64(input)
 		Expect(err).To(BeNil())
 		Expect(b64).ToNot(BeNil())
 	})

diff --git a/tests/e2e-aio/e2e_test.go b/tests/e2e-aio/e2e_test.go
@@ -171,7 +171,7 @@ var _ = Describe("E2E test", func() {
 		})
 		Context("vision", func() {
 			It("correctly", func() {
-				model := "gpt-4-vision-preview"
+				model := "gpt-4o"
 				resp, err := client.CreateChatCompletion(context.TODO(),
 					openai.ChatCompletionRequest{
 						Model: model, Messages: []openai.ChatCompletionMessage{