Skip to content

Commit

Permalink
merge?
Browse files Browse the repository at this point in the history
Signed-off-by: Dave Lee <[email protected]>
  • Loading branch information
dave-gray101 committed Jun 1, 2024
2 parents b792901 + b99182c commit 277bc1d
Show file tree
Hide file tree
Showing 11 changed files with 322 additions and 98 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ protogen-clean: protogen-go-clean protogen-python-clean
.PHONY: protogen-go
protogen-go:
mkdir -p pkg/grpc/proto
protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
backend/backend.proto

.PHONY: protogen-go-clean
Expand Down
1 change: 1 addition & 0 deletions backend/backend.proto
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ message TTSRequest {
string model = 2;
string dst = 3;
string voice = 4;
optional string language = 5;
}

message TokenizationResponse {
Expand Down
16 changes: 15 additions & 1 deletion backend/python/coqui/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,21 @@ def LoadModel(self, request, context):

def TTS(self, request, context):
    """Synthesize speech for request.text and write a wav file to request.dst.

    Args:
        request: backend_pb2.TTSRequest with text, dst, voice and optional language.
        context: gRPC servicer context (unused here).

    Returns:
        backend_pb2.Result with success=True, or success=False plus a message
        describing the failure.
    """
    try:
        # If the model is multilingual, take the language from the request,
        # falling back to the COQUI_LANGUAGE env value; an empty string means
        # "no language", which tts_to_file expects as None.
        lang = request.language or COQUI_LANGUAGE
        if lang == "":
            lang = None
        if self.tts.is_multi_lingual and lang is None:
            return backend_pb2.Result(success=False, message="Model is multi-lingual, but no language was provided")

        # proto3 string fields are never None in Python — an unset voice
        # arrives as "". The previous `request.voice is None` comparisons were
        # therefore constant (always False / always True); test truthiness so
        # the "no speaker" guard can actually fire and an empty voice falls
        # through to the speaker_wav path.
        if self.tts.is_multi_speaker and self.AudioPath is None and not request.voice:
            return backend_pb2.Result(success=False, message="Model is multi-speaker, but no speaker was provided")

        if self.tts.is_multi_speaker and request.voice:
            self.tts.tts_to_file(text=request.text, speaker=request.voice, language=lang, file_path=request.dst)
        else:
            self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=lang, file_path=request.dst)
    except Exception as err:
        return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
    return backend_pb2.Result(success=True)
Expand Down
27 changes: 20 additions & 7 deletions core/backend/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,22 @@ func (ttsbs *TextToSpeechBackendService) TextToAudioFile(request *schema.TTSRequ
bc.Backend = request.Backend
}
// TODO consider merging the below function in, but leave it seperated for diff reasons in the first PR
dst, err := ttsbs.modelTTS(request.Backend, request.Input, bc.Model, request.Voice, *bc)
dst, err := ttsbs.modelTTS(request.Backend, request.Input, bc.Model, request.Voice, request.Language, *bc)
log.Debug().Str("dst", dst).Err(err).Msg("modelTTS result in goroutine")
wjr.SetResult(dst, err)
}(wjr)

return jr
}

func (ttsbs *TextToSpeechBackendService) modelTTS(backend, text, modelFile, voice string, backendConfig config.BackendConfig) (string, error) {
func (ttsbs *TextToSpeechBackendService) modelTTS(
backend string,
text string,
modelFile string,
voice string,
language string,
backendConfig config.BackendConfig,
) (string, error) {
bb := backend
if bb == "" {
bb = model.PiperBackend
Expand Down Expand Up @@ -114,13 +121,19 @@ func (ttsbs *TextToSpeechBackendService) modelTTS(backend, text, modelFile, voic
}
}

_, err = ttsModel.TTS(context.Background(), &proto.TTSRequest{
Text: text,
Model: modelPath,
Voice: voice,
Dst: filePath,
res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{
Text: text,
Model: modelPath,
Voice: voice,
Dst: filePath,
Language: &language,
})

// return RPC error if any
if !res.Success {
return "", nil, fmt.Errorf(res.Message)

Check failure on line 134 in core/backend/tts.go

View workflow job for this annotation

GitHub Actions / build-macOS-arm64

too many return values

Check failure on line 134 in core/backend/tts.go

View workflow job for this annotation

GitHub Actions / tests-linux (1.21.x)

too many return values

Check failure on line 134 in core/backend/tts.go

View workflow job for this annotation

GitHub Actions / build-linux

too many return values
}

return filePath, err
}

Expand Down
10 changes: 6 additions & 4 deletions core/cli/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ type TTSCMD struct {
Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"`
Model string `short:"m" required:"" help:"Model name to run the TTS"`
Voice string `short:"v" help:"Voice name to run the TTS"`
Language string `short:"l" help:"Language to use with the TTS"`
OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
Expand Down Expand Up @@ -51,10 +52,11 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
}()

request := &schema.TTSRequest{
Backend: t.Backend,
Input: text,
Model: t.Model,
Voice: t.Voice,
Backend: t.Backend,
Input: text,
Model: t.Model,
Voice: t.Voice,
Language: t.Language,
}

ttsbs := backend.NewTextToSpeechBackendService(ml, config.NewBackendConfigLoader(), opts)
Expand Down
13 changes: 11 additions & 2 deletions core/config/backend_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,15 @@ const (
RAND_SEED = -1
)

// TTSConfig groups the text-to-speech specific settings of a model
// configuration. It is embedded in BackendConfig under the `tts` yaml key.
type TTSConfig struct {

	// Voice wav path or id — a speaker wav file or a speaker identifier,
	// depending on the backend in use.
	Voice string `yaml:"voice"`

	// Vall-e-x backend specific options (VallE is declared elsewhere in
	// this package).
	VallE VallE `yaml:"vall-e"`
}

type BackendConfig struct {
schema.PredictionOptions `yaml:"parameters"`
Name string `yaml:"name"`
Expand Down Expand Up @@ -52,8 +61,8 @@ type BackendConfig struct {
// GRPC Options
GRPC GRPC `yaml:"grpc"`

// Vall-e-x
VallE VallE `yaml:"vall-e"`
// TTS specifics
TTSConfig `yaml:"tts"`

// CUDA
// Explicitly enable CUDA or not (some backends might need it)
Expand Down
12 changes: 8 additions & 4 deletions core/http/endpoints/localai/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,14 @@ import (
)

// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
// @Summary Generates audio from the input text.
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "Response"
// @Router /v1/audio/speech [post]
//
// @Summary Generates audio from the input text.
// @Accept json
// @Produce audio/x-wav
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "generated audio/wav file"
// @Router /v1/audio/speech [post]
// @Router /tts [post]
func TTSEndpoint(ttsbs *backend.TextToSpeechBackendService, fce *ctx.FiberContentExtractor) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {

Expand Down
130 changes: 66 additions & 64 deletions core/schema/localai.go
Original file line number Diff line number Diff line change
@@ -1,64 +1,66 @@
package schema

import (
gopsutil "github.com/shirou/gopsutil/v3/process"
)

type BackendMonitorRequest struct {
Model string `json:"model" yaml:"model"`
}

type BackendMonitorResponse struct {
MemoryInfo *gopsutil.MemoryInfoStat
MemoryPercent float32
CPUPercent float64
}

type TTSRequest struct {
Model string `json:"model" yaml:"model"`
Input string `json:"input" yaml:"input"`
Voice string `json:"voice" yaml:"voice"`
Backend string `json:"backend" yaml:"backend"`
}

type RerankRequest struct {
JINARerankRequest
Backend string `json:"backend" yaml:"backend"`
}

type StoresSet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}

type StoresDelete struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys"`
}

type StoresGet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys" yaml:"keys"`
}

type StoresGetResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}

type StoresFind struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Key []float32 `json:"key" yaml:"key"`
Topk int `json:"topk" yaml:"topk"`
}

type StoresFindResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
Similarities []float32 `json:"similarities" yaml:"similarities"`
}
package schema

import (
gopsutil "github.com/shirou/gopsutil/v3/process"
)

// BackendMonitorRequest identifies the model whose backend process should
// be monitored.
type BackendMonitorRequest struct {
	Model string `json:"model" yaml:"model"`
}

// BackendMonitorResponse reports resource usage of a model's backend process.
// NOTE(review): these fields carry no json tags, so they marshal under their
// Go field names — confirm that is intentional for API consumers.
type BackendMonitorResponse struct {
	MemoryInfo    *gopsutil.MemoryInfoStat // process memory stats from gopsutil
	MemoryPercent float32                  // presumably percent of system memory used — verify against producer
	CPUPercent    float64                  // presumably process CPU percentage — verify against producer
}

// TTSRequest is the request body accepted by the text-to-speech endpoints.
//
// @Description TTS request body
type TTSRequest struct {
	Model    string `json:"model" yaml:"model"` // model name or full path
	Input    string `json:"input" yaml:"input"` // text input
	Voice    string `json:"voice" yaml:"voice"` // voice audio file or speaker id
	Backend  string `json:"backend" yaml:"backend"` // (optional) backend override for this request
	Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
}

// RerankRequest wraps a JINA-style rerank request with an optional backend
// override. JINARerankRequest is declared elsewhere in this package.
type RerankRequest struct {
	JINARerankRequest
	Backend string `json:"backend" yaml:"backend"`
}

// StoresSet is the request body for inserting key/value pairs into a vector
// store. Keys and Values are expected to be parallel slices.
type StoresSet struct {
	Store string `json:"store,omitempty" yaml:"store,omitempty"` // target store name; empty selects the default store

	Keys   [][]float32 `json:"keys" yaml:"keys"`
	Values []string    `json:"values" yaml:"values"`
}

// StoresDelete is the request body for removing entries from a vector store
// by their keys.
type StoresDelete struct {
	Store string `json:"store,omitempty" yaml:"store,omitempty"` // target store name; empty selects the default store

	Keys [][]float32 `json:"keys"`
}

// StoresGet is the request body for fetching the values stored under the
// given keys of a vector store.
type StoresGet struct {
	Store string `json:"store,omitempty" yaml:"store,omitempty"` // target store name; empty selects the default store

	Keys [][]float32 `json:"keys" yaml:"keys"`
}

// StoresGetResponse carries the keys and their associated values returned by
// a store lookup; the slices are parallel.
type StoresGetResponse struct {
	Keys   [][]float32 `json:"keys" yaml:"keys"`
	Values []string    `json:"values" yaml:"values"`
}

// StoresFind is the request body for a similarity search in a vector store:
// it looks up the Topk entries most similar to Key.
type StoresFind struct {
	Store string `json:"store,omitempty" yaml:"store,omitempty"` // target store name; empty selects the default store

	Key  []float32 `json:"key" yaml:"key"`   // query vector
	Topk int       `json:"topk" yaml:"topk"` // number of results to return
}

// StoresFindResponse carries the results of a similarity search; Keys,
// Values and Similarities are parallel slices.
type StoresFindResponse struct {
	Keys         [][]float32 `json:"keys" yaml:"keys"`
	Values       []string    `json:"values" yaml:"values"`
	Similarities []float32   `json:"similarities" yaml:"similarities"`
}
48 changes: 41 additions & 7 deletions docs/content/docs/features/text-to-audio.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ Coqui works without any configuration, to test it, you can run the following cur
}'
```

You can use the environment variable `COQUI_LANGUAGE` to set the language used by the coqui backend.

You can also use config files to configure TTS models (see the "Using config files" section below).

### Bark

[Bark](https://github.com/suno-ai/bark) allows to generate audio from text prompts.
Expand Down Expand Up @@ -148,11 +152,12 @@ name: cloned-voice
backend: vall-e-x
parameters:
model: "cloned-voice"
vall-e:
# The path to the audio file to be cloned
# relative to the models directory
# Max 15s
audio_path: "audio-sample.wav"
tts:
vall-e:
# The path to the audio file to be cloned
# relative to the models directory
# Max 15s
audio_path: "audio-sample.wav"
```
Then you can specify the model name in the requests:
Expand All @@ -164,6 +169,35 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
}' | aplay
```

## Parler-tts
### Parler-tts

[`parler-tts`](https://github.com/huggingface/parler-tts) is supported as well; it is possible to install and configure the model directly from the gallery.


## Using config files

You can also use a `config-file` to specify TTS models and their parameters.

In the following example we define a custom config to load the `xtts_v2` model, and specify a voice and language.

```yaml

name: xtts_v2
backend: coqui
parameters:
language: fr
model: tts_models/multilingual/multi-dataset/xtts_v2

tts:
voice: Ana Florence
```
`parler-tts`. It is possible to install and configure the model directly from the gallery. https://github.com/huggingface/parler-tts
With this config, you can now use the following curl command to generate a text-to-speech audio file:
```bash
curl -L http://localhost:8080/tts \
-H "Content-Type: application/json" \
-d '{
"model": "xtts_v2",
"input": "Bonjour, je suis Ana Florence. Comment puis-je vous aider?"
}' | aplay
```
Loading

0 comments on commit 277bc1d

Please sign in to comment.