diff --git a/.github/workflows/secscan.yaml b/.github/workflows/secscan.yaml index 08d7dfc6b712..db9db586947d 100644 --- a/.github/workflows/secscan.yaml +++ b/.github/workflows/secscan.yaml @@ -18,7 +18,7 @@ jobs: if: ${{ github.actor != 'dependabot[bot]' }} - name: Run Gosec Security Scanner if: ${{ github.actor != 'dependabot[bot]' }} - uses: securego/gosec@v2.21.2 + uses: securego/gosec@v2.21.0 with: # we let the report trigger content trigger a failure using the GitHub Security features. args: '-no-fail -fmt sarif -out results.sarif ./...' diff --git a/Makefile b/Makefile index f9fa54762008..286f4b5a2a19 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=23e0d70bacaaca1429d365a44aa9e7434f17823b +CPPLLAMA_VERSION?=64c6af3195c3cd4aa3328a1282d29cd2635c34c9 # go-rwkv version RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp @@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp -WHISPER_CPP_VERSION?=049b3a0e53c8a8e4c4576c06a1a4fccf0063a73f +WHISPER_CPP_VERSION?=5b1ce40fa882e9cb8630b48032067a1ed2f1534f # bert.cpp version BERT_REPO?=https://github.com/go-skynet/go-bert.cpp diff --git a/aio/cpu/vision.yaml b/aio/cpu/vision.yaml index 3b466d377b22..4052fa3924a8 100644 --- a/aio/cpu/vision.yaml +++ b/aio/cpu/vision.yaml @@ -2,7 +2,7 @@ backend: llama-cpp context_size: 4096 f16: true mmap: true -name: gpt-4-vision-preview +name: gpt-4o roles: user: "USER:" diff --git a/aio/gpu-8g/vision.yaml b/aio/gpu-8g/vision.yaml index db039279485f..4f5e10b3d2f3 100644 --- a/aio/gpu-8g/vision.yaml +++ b/aio/gpu-8g/vision.yaml @@ -2,7 +2,7 @@ backend: llama-cpp context_size: 4096 f16: true mmap: true -name: gpt-4-vision-preview +name: gpt-4o roles: user: "USER:" diff --git a/aio/intel/vision.yaml b/aio/intel/vision.yaml 
index 528431626efb..37067362713a 100644 --- a/aio/intel/vision.yaml +++ b/aio/intel/vision.yaml @@ -2,7 +2,7 @@ backend: llama-cpp context_size: 4096 mmap: false f16: false -name: gpt-4-vision-preview +name: gpt-4o roles: user: "USER:" diff --git a/backend/backend.proto b/backend/backend.proto index 4a8f31a94d4c..31bd63e50867 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -134,6 +134,8 @@ message PredictOptions { repeated string Images = 42; bool UseTokenizerTemplate = 43; repeated Message Messages = 44; + repeated string Videos = 45; + repeated string Audios = 46; } // The response message containing the result diff --git a/core/backend/llm.go b/core/backend/llm.go index 2b4564a886fe..f74071ba8528 100644 --- a/core/backend/llm.go +++ b/core/backend/llm.go @@ -31,7 +31,7 @@ type TokenUsage struct { Completion int } -func ModelInference(ctx context.Context, s string, messages []schema.Message, images []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) { +func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) { modelFile := c.Model threads := c.Threads if *threads == 0 && o.Threads != 0 { @@ -101,6 +101,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im opts.Messages = protoMessages opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate opts.Images = images + opts.Videos = videos + opts.Audios = audios tokenUsage := TokenUsage{} diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index 8144bdcd3341..b937120a3331 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -640,8 +640,16 @@ func 
handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m for _, m := range input.Messages { images = append(images, m.StringImages...) } + videos := []string{} + for _, m := range input.Messages { + videos = append(videos, m.StringVideos...) + } + audios := []string{} + for _, m := range input.Messages { + audios = append(audios, m.StringAudios...) + } - predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, ml, *config, o, nil) + predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, *config, o, nil) if err != nil { log.Error().Err(err).Msg("model inference failed") return "", err diff --git a/core/http/endpoints/openai/inference.go b/core/http/endpoints/openai/inference.go index 4950ce208a6f..da75d3a1ea5b 100644 --- a/core/http/endpoints/openai/inference.go +++ b/core/http/endpoints/openai/inference.go @@ -27,9 +27,17 @@ func ComputeChoices( for _, m := range req.Messages { images = append(images, m.StringImages...) } + videos := []string{} + for _, m := range req.Messages { + videos = append(videos, m.StringVideos...) + } + audios := []string{} + for _, m := range req.Messages { + audios = append(audios, m.StringAudios...) 
+ } // get the model function to call for the result - predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, loader, *config, o, tokenCallback) + predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, audios, loader, *config, o, tokenCallback) if err != nil { return result, backend.TokenUsage{}, err } diff --git a/core/http/endpoints/openai/request.go b/core/http/endpoints/openai/request.go index a99ebea2ec41..e24dd28f2e4b 100644 --- a/core/http/endpoints/openai/request.go +++ b/core/http/endpoints/openai/request.go @@ -135,7 +135,7 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque } // Decode each request's message content - index := 0 + imgIndex, vidIndex, audioIndex := 0, 0, 0 for i, m := range input.Messages { switch content := m.Content.(type) { case string: @@ -144,20 +144,44 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque dat, _ := json.Marshal(content) c := []schema.Content{} json.Unmarshal(dat, &c) + CONTENT: for _, pp := range c { - if pp.Type == "text" { + switch pp.Type { + case "text": input.Messages[i].StringContent = pp.Text - } else if pp.Type == "image_url" { - // Detect if pp.ImageURL is an URL, if it is download the image and encode it in base64: - base64, err := utils.GetImageURLAsBase64(pp.ImageURL.URL) - if err == nil { - input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff - // set a placeholder for each image - input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", index) + input.Messages[i].StringContent - index++ - } else { + case "video", "video_url": + // Decode content as base64 either if it's an URL or base64 text + base64, err := utils.GetContentURIAsBase64(pp.VideoURL.URL) + if err != nil { + log.Error().Msgf("Failed encoding video: %s", err) + continue CONTENT + } + input.Messages[i].StringVideos = 
append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff + // set a placeholder for each video + input.Messages[i].StringContent = fmt.Sprintf("[vid-%d]", vidIndex) + input.Messages[i].StringContent + vidIndex++ + case "audio_url", "audio": + // Decode content as base64 either if it's an URL or base64 text + base64, err := utils.GetContentURIAsBase64(pp.AudioURL.URL) + if err != nil { + log.Error().Msgf("Failed encoding audio: %s", err) + continue CONTENT + } + input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff + // set a placeholder for each audio + input.Messages[i].StringContent = fmt.Sprintf("[audio-%d]", audioIndex) + input.Messages[i].StringContent + audioIndex++ + case "image_url", "image": + // Decode content as base64 either if it's an URL or base64 text + base64, err := utils.GetContentURIAsBase64(pp.ImageURL.URL) + if err != nil { log.Error().Msgf("Failed encoding image: %s", err) + continue CONTENT + } + input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff + // set a placeholder for each image + input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", imgIndex) + input.Messages[i].StringContent + imgIndex++ } } } diff --git a/core/schema/openai.go b/core/schema/openai.go index fe4745bfcbd6..15bcd13df3e9 100644 --- a/core/schema/openai.go +++ b/core/schema/openai.go @@ -58,6 +58,8 @@ type Content struct { Type string `json:"type" yaml:"type"` Text string `json:"text" yaml:"text"` ImageURL ContentURL `json:"image_url" yaml:"image_url"` + AudioURL ContentURL `json:"audio_url" yaml:"audio_url"` + VideoURL ContentURL `json:"video_url" yaml:"video_url"` } type ContentURL struct { @@ -76,6 +78,8 @@ type Message struct { StringContent string `json:"string_content,omitempty" yaml:"string_content,omitempty"` StringImages []string `json:"string_images,omitempty" 
yaml:"string_images,omitempty"` + StringVideos []string `json:"string_videos,omitempty" yaml:"string_videos,omitempty"` + StringAudios []string `json:"string_audios,omitempty" yaml:"string_audios,omitempty"` // A result of a function call FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"` diff --git a/pkg/utils/base64.go b/pkg/utils/base64.go index 3fbb405b38af..50109eaa4b30 100644 --- a/pkg/utils/base64.go +++ b/pkg/utils/base64.go @@ -13,14 +13,8 @@ var base64DownloadClient http.Client = http.Client{ Timeout: 30 * time.Second, } -// this function check if the string is an URL, if it's an URL downloads the image in memory -// encodes it in base64 and returns the base64 string - -// This may look weird down in pkg/utils while it is currently only used in core/config -// -// but I believe it may be useful for MQTT as well in the near future, so I'm -// extracting it while I'm thinking of it. -func GetImageURLAsBase64(s string) (string, error) { +// GetContentURIAsBase64 checks if the string is an URL, if it's an URL downloads the content in memory encodes it in base64 and returns the base64 string, otherwise returns the string by stripping base64 data headers +func GetContentURIAsBase64(s string) (string, error) { if strings.HasPrefix(s, "http") { // download the image resp, err := base64DownloadClient.Get(s) diff --git a/pkg/utils/base64_test.go b/pkg/utils/base64_test.go index 3b3dc9fb2853..1f0d1352be1f 100644 --- a/pkg/utils/base64_test.go +++ b/pkg/utils/base64_test.go @@ -10,20 +10,20 @@ var _ = Describe("utils/base64 tests", func() { It("GetImageURLAsBase64 can strip jpeg data url prefixes", func() { // This one doesn't actually _care_ that it's base64, so feed "bad" data in this test in order to catch a change in that behavior for informational purposes. 
input := "data:image/jpeg;base64,FOO" - b64, err := GetImageURLAsBase64(input) + b64, err := GetContentURIAsBase64(input) Expect(err).To(BeNil()) Expect(b64).To(Equal("FOO")) }) It("GetImageURLAsBase64 can strip png data url prefixes", func() { // This one doesn't actually _care_ that it's base64, so feed "bad" data in this test in order to catch a change in that behavior for informational purposes. input := "data:image/png;base64,BAR" - b64, err := GetImageURLAsBase64(input) + b64, err := GetContentURIAsBase64(input) Expect(err).To(BeNil()) Expect(b64).To(Equal("BAR")) }) It("GetImageURLAsBase64 returns an error for bogus data", func() { input := "FOO" - b64, err := GetImageURLAsBase64(input) + b64, err := GetContentURIAsBase64(input) Expect(b64).To(Equal("")) Expect(err).ToNot(BeNil()) Expect(err).To(MatchError("not valid string")) @@ -31,7 +31,7 @@ var _ = Describe("utils/base64 tests", func() { It("GetImageURLAsBase64 can actually download images and calculates something", func() { // This test doesn't actually _check_ the results at this time, which is bad, but there wasn't a test at all before... input := "https://upload.wikimedia.org/wikipedia/en/2/29/Wargames.jpg" - b64, err := GetImageURLAsBase64(input) + b64, err := GetContentURIAsBase64(input) Expect(err).To(BeNil()) Expect(b64).ToNot(BeNil()) }) diff --git a/tests/e2e-aio/e2e_test.go b/tests/e2e-aio/e2e_test.go index f3f7b10660fc..36d127d2834e 100644 --- a/tests/e2e-aio/e2e_test.go +++ b/tests/e2e-aio/e2e_test.go @@ -171,7 +171,7 @@ var _ = Describe("E2E test", func() { }) Context("vision", func() { It("correctly", func() { - model := "gpt-4-vision-preview" + model := "gpt-4o" resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{ Model: model, Messages: []openai.ChatCompletionMessage{