Skip to content

Commit

Permalink
merge?
Browse files Browse the repository at this point in the history
Signed-off-by: Dave Lee <[email protected]>
  • Loading branch information
dave-gray101 committed Jun 1, 2024
2 parents b792901 + b99182c commit 277bc1d
Show file tree
Hide file tree
Showing 11 changed files with 322 additions and 98 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ protogen-clean: protogen-go-clean protogen-python-clean
.PHONY: protogen-go
protogen-go:
mkdir -p pkg/grpc/proto
protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
backend/backend.proto

.PHONY: protogen-go-clean
Expand Down
1 change: 1 addition & 0 deletions backend/backend.proto
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ message TTSRequest {
string model = 2;
string dst = 3;
string voice = 4;
optional string language = 5;
}

message TokenizationResponse {
Expand Down
16 changes: 15 additions & 1 deletion backend/python/coqui/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,21 @@ def LoadModel(self, request, context):

def TTS(self, request, context):
    """Synthesize speech for request.text and write a wav file to request.dst.

    Args:
        request: backend_pb2.TTSRequest with text, dst, voice and optional language.
        context: gRPC servicer context (unused here).

    Returns:
        backend_pb2.Result with success=True, or success=False plus a message
        describing the failure.
    """
    try:
        # If the model is multilingual, take the language from the request,
        # falling back to the COQUI_LANGUAGE env value; an empty string means
        # "no language", which tts_to_file expects as None.
        lang = request.language or COQUI_LANGUAGE
        if lang == "":
            lang = None
        if self.tts.is_multi_lingual and lang is None:
            return backend_pb2.Result(success=False, message="Model is multi-lingual, but no language was provided")

        # proto3 string fields are never None in Python — an unset voice
        # arrives as "". The previous `request.voice is None` comparisons were
        # therefore constant (always False / always True); test truthiness so
        # the "no speaker" guard can actually fire and an empty voice falls
        # through to the speaker_wav path.
        if self.tts.is_multi_speaker and self.AudioPath is None and not request.voice:
            return backend_pb2.Result(success=False, message="Model is multi-speaker, but no speaker was provided")

        if self.tts.is_multi_speaker and request.voice:
            self.tts.tts_to_file(text=request.text, speaker=request.voice, language=lang, file_path=request.dst)
        else:
            self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=lang, file_path=request.dst)
    except Exception as err:
        return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
    return backend_pb2.Result(success=True)
Expand Down
27 changes: 20 additions & 7 deletions core/backend/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,22 @@ func (ttsbs *TextToSpeechBackendService) TextToAudioFile(request *schema.TTSRequ
bc.Backend = request.Backend
}
// TODO consider merging the below function in, but leave it seperated for diff reasons in the first PR
dst, err := ttsbs.modelTTS(request.Backend, request.Input, bc.Model, request.Voice, *bc)
dst, err := ttsbs.modelTTS(request.Backend, request.Input, bc.Model, request.Voice, request.Language, *bc)
log.Debug().Str("dst", dst).Err(err).Msg("modelTTS result in goroutine")
wjr.SetResult(dst, err)
}(wjr)

return jr
}

func (ttsbs *TextToSpeechBackendService) modelTTS(backend, text, modelFile, voice string, backendConfig config.BackendConfig) (string, error) {
func (ttsbs *TextToSpeechBackendService) modelTTS(
backend string,
text string,
modelFile string,
voice string,
language string,
backendConfig config.BackendConfig,
) (string, error) {
bb := backend
if bb == "" {
bb = model.PiperBackend
Expand Down Expand Up @@ -114,13 +121,19 @@ func (ttsbs *TextToSpeechBackendService) modelTTS(backend, text, modelFile, voic
}
}

_, err = ttsModel.TTS(context.Background(), &proto.TTSRequest{
Text: text,
Model: modelPath,
Voice: voice,
Dst: filePath,
res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{
Text: text,
Model: modelPath,
Voice: voice,
Dst: filePath,
Language: &language,
})

// return RPC error if any
if !res.Success {
return "", nil, fmt.Errorf(res.Message)

Check failure on line 134 in core/backend/tts.go

View workflow job for this annotation

GitHub Actions / build-macOS-arm64

too many return values

Check failure on line 134 in core/backend/tts.go

View workflow job for this annotation

GitHub Actions / tests-linux (1.21.x)

too many return values

Check failure on line 134 in core/backend/tts.go

View workflow job for this annotation

GitHub Actions / build-linux

too many return values
}

return filePath, err
}

Expand Down
10 changes: 6 additions & 4 deletions core/cli/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ type TTSCMD struct {
Backend string `short:"b" default:"piper" help:"Backend to run the TTS model"`
Model string `short:"m" required:"" help:"Model name to run the TTS"`
Voice string `short:"v" help:"Voice name to run the TTS"`
Language string `short:"l" help:"Language to use with the TTS"`
OutputFile string `short:"o" type:"path" help:"The path to write the output wav file"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
Expand Down Expand Up @@ -51,10 +52,11 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
}()

request := &schema.TTSRequest{
Backend: t.Backend,
Input: text,
Model: t.Model,
Voice: t.Voice,
Backend: t.Backend,
Input: text,
Model: t.Model,
Voice: t.Voice,
Language: t.Language,
}

ttsbs := backend.NewTextToSpeechBackendService(ml, config.NewBackendConfigLoader(), opts)
Expand Down
13 changes: 11 additions & 2 deletions core/config/backend_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,15 @@ const (
RAND_SEED = -1
)

// TTSConfig groups the text-to-speech specific settings of a model
// configuration. It is embedded in BackendConfig under the `tts` yaml key.
type TTSConfig struct {

	// Voice wav path or id — a speaker wav file or a speaker identifier,
	// depending on the backend in use.
	Voice string `yaml:"voice"`

	// Vall-e-x backend specific options (VallE is declared elsewhere in
	// this package).
	VallE VallE `yaml:"vall-e"`
}

type BackendConfig struct {
schema.PredictionOptions `yaml:"parameters"`
Name string `yaml:"name"`
Expand Down Expand Up @@ -52,8 +61,8 @@ type BackendConfig struct {
// GRPC Options
GRPC GRPC `yaml:"grpc"`

// Vall-e-x
VallE VallE `yaml:"vall-e"`
// TTS specifics
TTSConfig `yaml:"tts"`

// CUDA
// Explicitly enable CUDA or not (some backends might need it)
Expand Down
12 changes: 8 additions & 4 deletions core/http/endpoints/localai/tts.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,14 @@ import (
)

// TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
// @Summary Generates audio from the input text.
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "Response"
// @Router /v1/audio/speech [post]
//
// @Summary Generates audio from the input text.
// @Accept json
// @Produce audio/x-wav
// @Param request body schema.TTSRequest true "query params"
// @Success 200 {string} binary "generated audio/wav file"
// @Router /v1/audio/speech [post]
// @Router /tts [post]
func TTSEndpoint(ttsbs *backend.TextToSpeechBackendService, fce *ctx.FiberContentExtractor) func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {

Expand Down
130 changes: 66 additions & 64 deletions core/schema/localai.go
Original file line number Diff line number Diff line change
@@ -1,64 +1,66 @@
package schema

import (
gopsutil "github.com/shirou/gopsutil/v3/process"
)

type BackendMonitorRequest struct {
Model string `json:"model" yaml:"model"`
}

type BackendMonitorResponse struct {
MemoryInfo *gopsutil.MemoryInfoStat
MemoryPercent float32
CPUPercent float64
}

type TTSRequest struct {
Model string `json:"model" yaml:"model"`
Input string `json:"input" yaml:"input"`
Voice string `json:"voice" yaml:"voice"`
Backend string `json:"backend" yaml:"backend"`
}

type RerankRequest struct {
JINARerankRequest
Backend string `json:"backend" yaml:"backend"`
}

type StoresSet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}

type StoresDelete struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys"`
}

type StoresGet struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Keys [][]float32 `json:"keys" yaml:"keys"`
}

type StoresGetResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
}

type StoresFind struct {
Store string `json:"store,omitempty" yaml:"store,omitempty"`

Key []float32 `json:"key" yaml:"key"`
Topk int `json:"topk" yaml:"topk"`
}

type StoresFindResponse struct {
Keys [][]float32 `json:"keys" yaml:"keys"`
Values []string `json:"values" yaml:"values"`
Similarities []float32 `json:"similarities" yaml:"similarities"`
}
package schema

import (
gopsutil "github.com/shirou/gopsutil/v3/process"
)

// BackendMonitorRequest identifies the model whose backend process should
// be monitored.
type BackendMonitorRequest struct {
	Model string `json:"model" yaml:"model"`
}

// BackendMonitorResponse reports resource usage of a model's backend process.
// NOTE(review): these fields carry no json tags, so they marshal under their
// Go field names — confirm that is intentional for API consumers.
type BackendMonitorResponse struct {
	MemoryInfo    *gopsutil.MemoryInfoStat // process memory stats from gopsutil
	MemoryPercent float32                  // presumably percent of system memory used — verify against producer
	CPUPercent    float64                  // presumably process CPU percentage — verify against producer
}

// TTSRequest is the request body accepted by the text-to-speech endpoints.
//
// @Description TTS request body
type TTSRequest struct {
	Model    string `json:"model" yaml:"model"` // model name or full path
	Input    string `json:"input" yaml:"input"` // text input
	Voice    string `json:"voice" yaml:"voice"` // voice audio file or speaker id
	Backend  string `json:"backend" yaml:"backend"` // (optional) backend override for this request
	Language string `json:"language,omitempty" yaml:"language,omitempty"` // (optional) language to use with TTS model
}

// RerankRequest wraps a JINA-style rerank request with an optional backend
// override. JINARerankRequest is declared elsewhere in this package.
type RerankRequest struct {
	JINARerankRequest
	Backend string `json:"backend" yaml:"backend"`
}

// StoresSet is the request body for inserting key/value pairs into a vector
// store. Keys and Values are expected to be parallel slices.
type StoresSet struct {
	Store string `json:"store,omitempty" yaml:"store,omitempty"` // target store name; empty selects the default store

	Keys   [][]float32 `json:"keys" yaml:"keys"`
	Values []string    `json:"values" yaml:"values"`
}

// StoresDelete is the request body for removing entries from a vector store
// by their keys.
type StoresDelete struct {
	Store string `json:"store,omitempty" yaml:"store,omitempty"` // target store name; empty selects the default store

	Keys [][]float32 `json:"keys"`
}

// StoresGet is the request body for fetching the values stored under the
// given keys of a vector store.
type StoresGet struct {
	Store string `json:"store,omitempty" yaml:"store,omitempty"` // target store name; empty selects the default store

	Keys [][]float32 `json:"keys" yaml:"keys"`
}

// StoresGetResponse carries the keys and their associated values returned by
// a store lookup; the slices are parallel.
type StoresGetResponse struct {
	Keys   [][]float32 `json:"keys" yaml:"keys"`
	Values []string    `json:"values" yaml:"values"`
}

// StoresFind is the request body for a similarity search in a vector store:
// it looks up the Topk entries most similar to Key.
type StoresFind struct {
	Store string `json:"store,omitempty" yaml:"store,omitempty"` // target store name; empty selects the default store

	Key  []float32 `json:"key" yaml:"key"`   // query vector
	Topk int       `json:"topk" yaml:"topk"` // number of results to return
}

// StoresFindResponse carries the results of a similarity search; Keys,
// Values and Similarities are parallel slices.
type StoresFindResponse struct {
	Keys         [][]float32 `json:"keys" yaml:"keys"`
	Values       []string    `json:"values" yaml:"values"`
	Similarities []float32   `json:"similarities" yaml:"similarities"`
}
48 changes: 41 additions & 7 deletions docs/content/docs/features/text-to-audio.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ Coqui works without any configuration, to test it, you can run the following cur
}'
```

You can use the environment variable `COQUI_LANGUAGE` to set the language used by the coqui backend.

You can also use config files to configure TTS models (see the "Using config files" section below).

### Bark

[Bark](https://github.com/suno-ai/bark) allows to generate audio from text prompts.
Expand Down Expand Up @@ -148,11 +152,12 @@ name: cloned-voice
backend: vall-e-x
parameters:
model: "cloned-voice"
vall-e:
# The path to the audio file to be cloned
# relative to the models directory
# Max 15s
audio_path: "audio-sample.wav"
tts:
vall-e:
# The path to the audio file to be cloned
# relative to the models directory
# Max 15s
audio_path: "audio-sample.wav"
```
Then you can specify the model name in the requests:
Expand All @@ -164,6 +169,35 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
}' | aplay
```

## Parler-tts
### Parler-tts

[`parler-tts`](https://github.com/huggingface/parler-tts) is supported as well; it is possible to install and configure the model directly from the gallery.


## Using config files

You can also use a `config-file` to specify TTS models and their parameters.

In the following example we define a custom config to load the `xtts_v2` model, and specify a voice and language.

```yaml

name: xtts_v2
backend: coqui
parameters:
language: fr
model: tts_models/multilingual/multi-dataset/xtts_v2

tts:
voice: Ana Florence
```
`parler-tts`. It is possible to install and configure the model directly from the gallery. https://github.com/huggingface/parler-tts
With this config, you can now use the following curl command to generate a text-to-speech audio file:
```bash
curl -L http://localhost:8080/tts \
-H "Content-Type: application/json" \
-d '{
"model": "xtts_v2",
"input": "Bonjour, je suis Ana Florence. Comment puis-je vous aider?"
}' | aplay
```
Loading

0 comments on commit 277bc1d

Please sign in to comment.