Skip to content

Commit

Permalink
Add C++ and Python API for Kokoro TTS models. (#1715)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Jan 16, 2025
1 parent 9efe26a commit ffc6b48
Show file tree
Hide file tree
Showing 27 changed files with 1,193 additions and 29 deletions.
25 changes: 25 additions & 0 deletions .github/scripts/test-offline-tts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,31 @@ which $EXE
# test waves are saved in ./tts
mkdir ./tts

log "------------------------------------------------------------"
log "kokoro-en-v0_19"
log "------------------------------------------------------------"
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
tar xf kokoro-en-v0_19.tar.bz2
rm kokoro-en-v0_19.tar.bz2

# mapping of sid to voice name
# 0->af, 1->af_bella, 2->af_nicole, 3->af_sarah, 4->af_sky, 5->am_adam
# 6->am_michael, 7->bf_emma, 8->bf_isabella, 9->bm_george, 10->bm_lewis

for sid in $(seq 0 10); do
$EXE \
--debug=1 \
--kokoro-model=./kokoro-en-v0_19/model.onnx \
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
--num-threads=2 \
--sid=$sid \
--output-filename="./tts/kokoro-$sid.wav" \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."
done
rm -rf kokoro-en-v0_19

log "------------------------------------------------------------"
log "matcha-icefall-en_US-ljspeech"
log "------------------------------------------------------------"
Expand Down
19 changes: 19 additions & 0 deletions .github/scripts/test-python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,25 @@ log "Offline TTS test"
# test waves are saved in ./tts
mkdir ./tts

log "kokoro-en-v0_19 test"

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
tar xf kokoro-en-v0_19.tar.bz2
rm kokoro-en-v0_19.tar.bz2

python3 ./python-api-examples/offline-tts.py \
--debug=1 \
--kokoro-model=./kokoro-en-v0_19/model.onnx \
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
--num-threads=2 \
--sid=10 \
--output-filename="./tts/kokoro-10.wav" \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be a statesman, a businessman, an official, or a scholar."

rm -rf kokoro-en-v0_19

log "matcha-ljspeech-en test"

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
Expand Down
63 changes: 58 additions & 5 deletions python-api-examples/offline-tts-play.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
Usage:
Example (1/5)
Example (1/6)
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
Expand All @@ -23,7 +23,7 @@
--output-filename=./generated.wav \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
Example (2/5)
Example (2/6)
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
tar xvf vits-zh-aishell3.tar.bz2
Expand All @@ -37,7 +37,7 @@
--output-filename=./liubei-21.wav \
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
Example (3/5)
Example (3/6)
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
Expand All @@ -53,7 +53,7 @@
--output-filename=./test-2.wav \
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
Example (4/5)
Example (4/6)
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
Expand All @@ -71,7 +71,7 @@
--output-filename=./test-matcha.wav \
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
Example (5/5)
Example (5/6)
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
Expand All @@ -88,6 +88,22 @@
--num-threads=2 \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
Example (6/6)
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
tar xf kokoro-en-v0_19.tar.bz2
rm kokoro-en-v0_19.tar.bz2
python3 ./python-api-examples/offline-tts-play.py \
--debug=1 \
--kokoro-model=./kokoro-en-v0_19/model.onnx \
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
--num-threads=2 \
--sid=10 \
--output-filename="./kokoro-10.wav" \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
You can find more models at
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
Expand Down Expand Up @@ -202,13 +218,44 @@ def add_matcha_args(parser):
)


def add_kokoro_args(parser):
    """Register the Kokoro TTS command-line options on ``parser``.

    Adds four string options, each defaulting to the empty string:
    ``--kokoro-model``, ``--kokoro-voices``, ``--kokoro-tokens`` and
    ``--kokoro-data-dir``.

    Args:
      parser: The ``argparse`` parser that receives the new options.
    """
    parser.add_argument(
        "--kokoro-model",
        type=str,
        default="",
        help="Path to model.onnx for kokoro",
    )

    parser.add_argument(
        "--kokoro-voices",
        type=str,
        default="",
        help="Path to voices.bin for kokoro",
    )

    parser.add_argument(
        "--kokoro-tokens",
        type=str,
        default="",
        help="Path to tokens.txt for kokoro",
    )

    parser.add_argument(
        "--kokoro-data-dir",
        type=str,
        default="",
        help="Path to the dict directory of espeak-ng.",
    )


def get_args():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

add_vits_args(parser)
add_matcha_args(parser)
add_kokoro_args(parser)

parser.add_argument(
"--tts-rule-fsts",
Expand Down Expand Up @@ -407,6 +454,12 @@ def main():
data_dir=args.matcha_data_dir,
dict_dir=args.matcha_dict_dir,
),
kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
model=args.kokoro_model,
voices=args.kokoro_voices,
tokens=args.kokoro_tokens,
data_dir=args.kokoro_data_dir,
),
provider=args.provider,
debug=args.debug,
num_threads=args.num_threads,
Expand Down
66 changes: 60 additions & 6 deletions python-api-examples/offline-tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
Usage:
Example (1/5)
Example (1/6)
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
Expand All @@ -24,7 +24,7 @@
--output-filename=./generated.wav \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
Example (2/5)
Example (2/6)
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
tar xvf vits-icefall-zh-aishell3.tar.bz2
Expand All @@ -38,7 +38,7 @@
--output-filename=./liubei-21.wav \
"勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
Example (3/5)
Example (3/6)
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
Expand All @@ -54,7 +54,7 @@
--output-filename=./test-2.wav \
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
Example (4/5)
Example (4/6)
curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
Expand All @@ -72,7 +72,7 @@
--output-filename=./test-matcha.wav \
"某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
Example (5/5)
Example (5/6)
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
Expand All @@ -89,6 +89,23 @@
--num-threads=2 \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
Example (6/6)
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
tar xf kokoro-en-v0_19.tar.bz2
rm kokoro-en-v0_19.tar.bz2
python3 ./python-api-examples/offline-tts.py \
--debug=1 \
--kokoro-model=./kokoro-en-v0_19/model.onnx \
--kokoro-voices=./kokoro-en-v0_19/voices.bin \
--kokoro-tokens=./kokoro-en-v0_19/tokens.txt \
--kokoro-data-dir=./kokoro-en-v0_19/espeak-ng-data \
--num-threads=2 \
--sid=10 \
--output-filename="./kokoro-10.wav" \
"Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
You can find more models at
https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
Expand Down Expand Up @@ -188,13 +205,44 @@ def add_matcha_args(parser):
)


def add_kokoro_args(parser):
    """Register the Kokoro TTS command-line options on ``parser``.

    Adds four string options, each defaulting to the empty string:
    ``--kokoro-model``, ``--kokoro-voices``, ``--kokoro-tokens`` and
    ``--kokoro-data-dir``.

    Args:
      parser: The ``argparse`` parser that receives the new options.
    """
    parser.add_argument(
        "--kokoro-model",
        type=str,
        default="",
        help="Path to model.onnx for kokoro",
    )

    parser.add_argument(
        "--kokoro-voices",
        type=str,
        default="",
        help="Path to voices.bin for kokoro",
    )

    parser.add_argument(
        "--kokoro-tokens",
        type=str,
        default="",
        help="Path to tokens.txt for kokoro",
    )

    parser.add_argument(
        "--kokoro-data-dir",
        type=str,
        default="",
        help="Path to the dict directory of espeak-ng.",
    )


def get_args():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)

add_vits_args(parser)
add_matcha_args(parser)
add_kokoro_args(parser)

parser.add_argument(
"--tts-rule-fsts",
Expand All @@ -206,7 +254,7 @@ def get_args():
parser.add_argument(
"--max-num-sentences",
type=int,
default=2,
default=1,
help="""Max number of sentences in a batch to avoid OOM if the input
text is very long. Set it to -1 to process all the sentences in a
single batch. A smaller value does not mean it is slower compared
Expand Down Expand Up @@ -289,6 +337,12 @@ def main():
data_dir=args.matcha_data_dir,
dict_dir=args.matcha_dict_dir,
),
kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
model=args.kokoro_model,
voices=args.kokoro_voices,
tokens=args.kokoro_tokens,
data_dir=args.kokoro_data_dir,
),
provider=args.provider,
debug=args.debug,
num_threads=args.num_threads,
Expand Down
2 changes: 2 additions & 0 deletions sherpa-onnx/csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,8 @@ if(SHERPA_ONNX_ENABLE_TTS)
offline-tts-character-frontend.cc
offline-tts-frontend.cc
offline-tts-impl.cc
offline-tts-kokoro-model-config.cc
offline-tts-kokoro-model.cc
offline-tts-matcha-model-config.cc
offline-tts-matcha-model.cc
offline-tts-model-config.cc
Expand Down
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/melo-tts-lexicon.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <vector>

#include "sherpa-onnx/csrc/offline-tts-frontend.h"
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"

namespace sherpa_onnx {

Expand Down
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/offline-tts-character-frontend.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include <vector>

#include "sherpa-onnx/csrc/offline-tts-frontend.h"
#include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
#include "sherpa-onnx/csrc/offline-tts-vits-model-meta-data.h"

namespace sherpa_onnx {

Expand Down
10 changes: 8 additions & 2 deletions sherpa-onnx/csrc/offline-tts-impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "rawfile/raw_file_manager.h"
#endif

#include "sherpa-onnx/csrc/offline-tts-kokoro-impl.h"
#include "sherpa-onnx/csrc/offline-tts-matcha-impl.h"
#include "sherpa-onnx/csrc/offline-tts-vits-impl.h"

Expand All @@ -37,18 +38,23 @@ std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
const OfflineTtsConfig &config) {
if (!config.model.vits.model.empty()) {
return std::make_unique<OfflineTtsVitsImpl>(config);
} else if (!config.model.matcha.acoustic_model.empty()) {
return std::make_unique<OfflineTtsMatchaImpl>(config);
}
return std::make_unique<OfflineTtsMatchaImpl>(config);

return std::make_unique<OfflineTtsKokoroImpl>(config);
}

// Creates the TTS implementation that matches the model set in `config`,
// forwarding `mgr` and `config` to the chosen implementation's constructor.
//
// Selection order:
//   1. VITS,   if config.model.vits.model is non-empty
//   2. Matcha, if config.model.matcha.acoustic_model is non-empty
//   3. Kokoro, otherwise
template <typename Manager>
std::unique_ptr<OfflineTtsImpl> OfflineTtsImpl::Create(
    Manager *mgr, const OfflineTtsConfig &config) {
  if (!config.model.vits.model.empty()) {
    return std::make_unique<OfflineTtsVitsImpl>(mgr, config);
  }

  if (!config.model.matcha.acoustic_model.empty()) {
    return std::make_unique<OfflineTtsMatchaImpl>(mgr, config);
  }

  // Neither VITS nor Matcha is configured, so fall back to Kokoro. An
  // unconditional Matcha return must not precede this line, or the Kokoro
  // implementation can never be selected.
  return std::make_unique<OfflineTtsKokoroImpl>(mgr, config);
}

#if __ANDROID_API__ >= 9
Expand Down
Loading

0 comments on commit ffc6b48

Please sign in to comment.