diff --git a/Makefile b/Makefile index 20a5f2e00015..71ce394f430a 100644 --- a/Makefile +++ b/Makefile @@ -447,7 +447,7 @@ protogen-clean: protogen-go-clean protogen-python-clean .PHONY: protogen-go protogen-go: mkdir -p pkg/grpc/proto - protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \ + protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \ backend/backend.proto .PHONY: protogen-go-clean diff --git a/backend/backend.proto b/backend/backend.proto index cb87fe02d46f..aec0c00e74e4 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -266,6 +266,7 @@ message TTSRequest { string model = 2; string dst = 3; string voice = 4; + optional string language = 5; } message TokenizationResponse { diff --git a/backend/python/coqui/backend.py b/backend/python/coqui/backend.py index c6432208f5e9..02ab56f4a589 100644 --- a/backend/python/coqui/backend.py +++ b/backend/python/coqui/backend.py @@ -66,7 +66,21 @@ def LoadModel(self, request, context): def TTS(self, request, context): try: - self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=COQUI_LANGUAGE, file_path=request.dst) + # if model is multilingual, use the language from the request, falling back to the env default + lang = request.language or COQUI_LANGUAGE + if lang == "": + lang = None + if self.tts.is_multi_lingual and lang is None: + return backend_pb2.Result(success=False, message="Model is multi-lingual, but no language was provided") + + # if model is multi-speaker, use speaker_wav or the speaker_id from request.voice (proto3 strings are "" when unset, never None) + if self.tts.is_multi_speaker and self.AudioPath is None and not request.voice: + return backend_pb2.Result(success=False, message="Model is multi-speaker, but no speaker was provided") + + if self.tts.is_multi_speaker and request.voice: 
self.tts.tts_to_file(text=request.text, speaker=request.voice, language=lang, file_path=request.dst) + else: + self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=lang, file_path=request.dst) except Exception as err: return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") return backend_pb2.Result(success=True) diff --git a/core/backend/tts.go b/core/backend/tts.go index 4532cf00adbc..b1c23ebb3e5d 100644 --- a/core/backend/tts.go +++ b/core/backend/tts.go @@ -29,7 +29,16 @@ func generateUniqueFileName(dir, baseName, ext string) string { } } -func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) { +func ModelTTS( + backend, + text, + modelFile, + voice, + language string, + loader *model.ModelLoader, + appConfig *config.ApplicationConfig, + backendConfig config.BackendConfig, +) (string, *proto.Result, error) { bb := backend if bb == "" { bb = model.PiperBackend @@ -83,7 +92,13 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, Model: modelPath, Voice: voice, Dst: filePath, + Language: &language, }) + // surface a backend-reported failure; on a transport error res may be nil, so only inspect it when err is nil + if err == nil && !res.Success { + return "", nil, fmt.Errorf("%s", res.Message) + } + return filePath, res, err } diff --git a/core/cli/tts.go b/core/cli/tts.go index 8b54ed281118..cbba0fc5fb7e 100644 --- a/core/cli/tts.go +++ b/core/cli/tts.go @@ -20,6 +20,7 @@ type TTSCMD struct { Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"` Model string `short:"m" required:"" help:"Model name to run the TTS"` Voice string `short:"v" help:"Voice name to run the TTS"` + Language string `short:"l" help:"Language to use with the TTS"` OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"` ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path 
containing models used for inferencing" group:"storage"` BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"` @@ -52,7 +53,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error { options := config.BackendConfig{} options.SetDefaults() - filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options) + filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options) if err != nil { return err } diff --git a/core/config/backend_config.go b/core/config/backend_config.go index eda663603055..1ca117165505 100644 --- a/core/config/backend_config.go +++ b/core/config/backend_config.go @@ -15,6 +15,15 @@ const ( RAND_SEED = -1 ) +type TTSConfig struct { + + // Voice wav path or id + Voice string `yaml:"voice"` + + // Vall-e-x + VallE VallE `yaml:"vall-e"` +} + type BackendConfig struct { schema.PredictionOptions `yaml:"parameters"` Name string `yaml:"name"` @@ -49,8 +58,8 @@ type BackendConfig struct { // GRPC Options GRPC GRPC `yaml:"grpc"` - // Vall-e-x - VallE VallE `yaml:"vall-e"` + // TTS specifics + TTSConfig `yaml:"tts"` // CUDA // Explicitly enable CUDA or not (some backends might need it) diff --git a/core/http/endpoints/elevenlabs/tts.go b/core/http/endpoints/elevenlabs/tts.go index 841f9b5f7846..e7bfe0f7bbfb 100644 --- a/core/http/endpoints/elevenlabs/tts.go +++ b/core/http/endpoints/elevenlabs/tts.go @@ -52,7 +52,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi } log.Debug().Msgf("Request for model: %s", modelFile) - filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg) + filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, "", ml, appConfig, *cfg) if err != nil { return err } diff --git 
a/core/http/endpoints/localai/tts.go b/core/http/endpoints/localai/tts.go index 7822e0242c2c..4e5a1b5b16d3 100644 --- a/core/http/endpoints/localai/tts.go +++ b/core/http/endpoints/localai/tts.go @@ -12,10 +12,13 @@ import ( ) // TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech -// @Summary Generates audio from the input text. -// @Param request body schema.TTSRequest true "query params" -// @Success 200 {string} binary "Response" -// @Router /v1/audio/speech [post] +// @Summary Generates audio from the input text. +// @Accept json +// @Produce audio/x-wav +// @Param request body schema.TTSRequest true "query params" +// @Success 200 {string} binary "generated audio/wav file" +// @Router /v1/audio/speech [post] +// @Router /tts [post] func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error { return func(c *fiber.Ctx) error { @@ -40,6 +43,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi ) if err != nil { + log.Err(err).Send() modelFile = input.Model log.Warn().Msgf("Model not found in context: %s", input.Model) } else { @@ -51,7 +55,15 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi cfg.Backend = input.Backend } - filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg) + if input.Language != "" { + cfg.Language = input.Language + } + + if input.Voice != "" { + cfg.Voice = input.Voice + } + + filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, cfg.Voice, cfg.Language, ml, appConfig, *cfg) if err != nil { return err } diff --git a/core/schema/localai.go b/core/schema/localai.go index e9b61cf3d502..9bbfe28b5e4c 100644 --- a/core/schema/localai.go +++ b/core/schema/localai.go @@ -1,59 +1,61 @@ -package schema - -import ( - gopsutil "github.com/shirou/gopsutil/v3/process" -) - -type 
BackendMonitorRequest struct { - Model string `json:"model" yaml:"model"` -} - -type BackendMonitorResponse struct { - MemoryInfo *gopsutil.MemoryInfoStat - MemoryPercent float32 - CPUPercent float64 -} - -type TTSRequest struct { - Model string `json:"model" yaml:"model"` - Input string `json:"input" yaml:"input"` - Voice string `json:"voice" yaml:"voice"` - Backend string `json:"backend" yaml:"backend"` -} - -type StoresSet struct { - Store string `json:"store,omitempty" yaml:"store,omitempty"` - - Keys [][]float32 `json:"keys" yaml:"keys"` - Values []string `json:"values" yaml:"values"` -} - -type StoresDelete struct { - Store string `json:"store,omitempty" yaml:"store,omitempty"` - - Keys [][]float32 `json:"keys"` -} - -type StoresGet struct { - Store string `json:"store,omitempty" yaml:"store,omitempty"` - - Keys [][]float32 `json:"keys" yaml:"keys"` -} - -type StoresGetResponse struct { - Keys [][]float32 `json:"keys" yaml:"keys"` - Values []string `json:"values" yaml:"values"` -} - -type StoresFind struct { - Store string `json:"store,omitempty" yaml:"store,omitempty"` - - Key []float32 `json:"key" yaml:"key"` - Topk int `json:"topk" yaml:"topk"` -} - -type StoresFindResponse struct { - Keys [][]float32 `json:"keys" yaml:"keys"` - Values []string `json:"values" yaml:"values"` - Similarities []float32 `json:"similarities" yaml:"similarities"` -} +package schema + +import ( + gopsutil "github.com/shirou/gopsutil/v3/process" +) + +type BackendMonitorRequest struct { + Model string `json:"model" yaml:"model"` +} + +type BackendMonitorResponse struct { + MemoryInfo *gopsutil.MemoryInfoStat + MemoryPercent float32 + CPUPercent float64 +} + +// @Description TTS request body +type TTSRequest struct { + Model string `json:"model" yaml:"model"` // model name or full path + Input string `json:"input" yaml:"input"` // text input + Voice string `json:"voice" yaml:"voice"` // voice audio file or speaker id + Backend string `json:"backend" yaml:"backend"` + Language string 
`json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model +} + +type StoresSet struct { + Store string `json:"store,omitempty" yaml:"store,omitempty"` + + Keys [][]float32 `json:"keys" yaml:"keys"` + Values []string `json:"values" yaml:"values"` +} + +type StoresDelete struct { + Store string `json:"store,omitempty" yaml:"store,omitempty"` + + Keys [][]float32 `json:"keys"` +} + +type StoresGet struct { + Store string `json:"store,omitempty" yaml:"store,omitempty"` + + Keys [][]float32 `json:"keys" yaml:"keys"` +} + +type StoresGetResponse struct { + Keys [][]float32 `json:"keys" yaml:"keys"` + Values []string `json:"values" yaml:"values"` +} + +type StoresFind struct { + Store string `json:"store,omitempty" yaml:"store,omitempty"` + + Key []float32 `json:"key" yaml:"key"` + Topk int `json:"topk" yaml:"topk"` +} + +type StoresFindResponse struct { + Keys [][]float32 `json:"keys" yaml:"keys"` + Values []string `json:"values" yaml:"values"` + Similarities []float32 `json:"similarities" yaml:"similarities"` +} diff --git a/docs/content/docs/features/text-to-audio.md b/docs/content/docs/features/text-to-audio.md index ebfdda1d0e9d..0e82f7f07ba8 100644 --- a/docs/content/docs/features/text-to-audio.md +++ b/docs/content/docs/features/text-to-audio.md @@ -46,6 +46,10 @@ Coqui works without any configuration, to test it, you can run the following cur }' ``` +You can use the env variable COQUI_LANGUAGE to set the language used by the coqui backend. + +You can also use config files to configure tts models (see section below on how to use config files). + ### Bark [Bark](https://github.com/suno-ai/bark) allows to generate audio from text prompts. 
@@ -148,11 +152,12 @@ name: cloned-voice backend: vall-e-x parameters: model: "cloned-voice" -vall-e: - # The path to the audio file to be cloned - # relative to the models directory - # Max 15s - audio_path: "audio-sample.wav" +tts: + vall-e: + # The path to the audio file to be cloned + # relative to the models directory + # Max 15s + audio_path: "audio-sample.wav" ``` Then you can specify the model name in the requests: @@ -164,6 +169,35 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ }' | aplay ``` -## Parler-tts +### Parler-tts + +`parler-tts`. It is possible to install and configure the model directly from the gallery. https://github.com/huggingface/parler-tts + + +## Using config files + +You can also use a `config-file` to specify TTS models and their parameters. + +In the following example we define a custom config to load the `xtts_v2` model, and specify a voice and language. + +```yaml + +name: xtts_v2 +backend: coqui +parameters: + language: fr + model: tts_models/multilingual/multi-dataset/xtts_v2 + +tts: + voice: Ana Florence +``` -`parler-tts`. It is possible to install and configure the model directly from the gallery. https://github.com/huggingface/parler-tts \ No newline at end of file +With this config, you can now use the following curl command to generate a text-to-speech audio file: +```bash +curl -L http://localhost:8080/tts \ + -H "Content-Type: application/json" \ + -d '{ +"model": "xtts_v2", +"input": "Bonjour, je suis Ana Florence. Comment puis-je vous aider?" +}' | aplay +```