Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Dart API for MatchaTTS models #1687

Merged
merged 2 commits into from
Jan 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 25 additions & 16 deletions .github/scripts/test-dart.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,31 @@ set -ex

cd dart-api-examples

# Run every Dart TTS example from inside dart-api-examples/tts.
pushd tts

# MatchaTTS: both scripts download their acoustic model and the shared
# hifigan_v2.onnx vocoder into the current directory before synthesizing.
echo '----------matcha tts----------'
./run-matcha-zh.sh
./run-matcha-en.sh
ls -lh *.wav
# Remove the downloaded model directories and the vocoder once the examples
# have run (presumably to keep disk usage on the CI runner low).
rm -rf matcha-icefall-*
rm *.onnx

echo '----------piper tts----------'
./run-piper.sh
rm -rf vits-piper-*

echo '----------coqui tts----------'
./run-coqui.sh
rm -rf vits-coqui-*

echo '----------zh tts----------'
./run-vits-zh.sh
rm -rf sherpa-onnx-*

# List every wave file produced by the examples above.
ls -lh *.wav

popd # tts

pushd speaker-diarization
echo '----------speaker diarization----------'
./run.sh
Expand Down Expand Up @@ -106,22 +131,6 @@ rm -rf sherpa-onnx-*

popd # non-streaming-asr

pushd tts

echo '----------piper tts----------'
./run-piper.sh
rm -rf vits-piper-*

echo '----------coqui tts----------'
./run-coqui.sh
rm -rf vits-coqui-*

echo '----------zh tts----------'
./run-zh.sh
rm -rf sherpa-onnx-*

popd # tts

pushd streaming-asr

echo '----------streaming zipformer ctc HLG----------'
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/checksum.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:

jobs:
checksum:
if: github.repository_owner == 'k2-fsa'
runs-on: macos-latest
strategy:
matrix:
Expand Down
86 changes: 86 additions & 0 deletions dart-api-examples/tts/bin/matcha-en.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright (c) 2025 Xiaomi Corporation
import 'dart:io';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

/// Synthesizes speech with an English MatchaTTS model and saves it as a wave
/// file.
///
/// Required options: `--acoustic-model`, `--vocoder`, `--tokens`,
/// `--output-wav` and `--text`. Prints the usage and exits with status 1 when
/// the command line cannot be parsed or a required option is missing.
Future<void> main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('acoustic-model', help: 'Path to the acoustic model')
    ..addOption('vocoder', help: 'Path to the vocoder model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption(
      'data-dir',
      help: 'Path to espeak-ng-data directory',
      defaultsTo: '',
    )
    ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
    ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
    ..addOption('text', help: 'Text to generate TTS for')
    ..addOption('output-wav', help: 'Filename to save the generated audio')
    ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
    ..addOption(
      'sid',
      help: 'Speaker ID to select. Used only for multi-speaker TTS',
      defaultsTo: '0',
    );

  final ArgResults res;
  try {
    res = parser.parse(arguments);
  } on FormatException catch (e) {
    // Unknown options or options without a value end up here; show a short
    // message and the usage instead of an uncaught-exception stack trace.
    stderr.writeln(e.message);
    print(parser.usage);
    exit(1);
  }

  // Only options without a default can be null. 'data-dir', 'rule-fsts',
  // 'rule-fars', 'speed' and 'sid' always carry at least their default value.
  const requiredOptions = [
    'acoustic-model',
    'vocoder',
    'tokens',
    'output-wav',
    'text',
  ];
  if (requiredOptions.any((name) => res[name] == null)) {
    print(parser.usage);
    exit(1);
  }

  final acousticModel = res['acoustic-model'] as String;
  final vocoder = res['vocoder'] as String;
  final tokens = res['tokens'] as String;
  final dataDir = res['data-dir'] as String;
  final ruleFsts = res['rule-fsts'] as String;
  final ruleFars = res['rule-fars'] as String;
  final text = res['text'] as String;
  final outputWav = res['output-wav'] as String;
  final sid = int.tryParse(res['sid'] as String) ?? 0;

  // The speed is used as a divisor below, so anything that is not a finite,
  // strictly positive number falls back to the default speed.
  var speed = double.tryParse(res['speed'] as String) ?? 1.0;
  if (!speed.isFinite || speed <= 0) {
    speed = 1.0;
  }

  final matcha = sherpa_onnx.OfflineTtsMatchaModelConfig(
    acousticModel: acousticModel,
    vocoder: vocoder,
    tokens: tokens,
    dataDir: dataDir,
    lengthScale: 1 / speed,
  );

  final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
    matcha: matcha,
    numThreads: 1,
    debug: true,
  );
  final config = sherpa_onnx.OfflineTtsConfig(
    model: modelConfig,
    maxNumSenetences: 1,
    ruleFsts: ruleFsts,
    ruleFars: ruleFars,
  );

  final tts = sherpa_onnx.OfflineTts(config);
  try {
    final audio = tts.generate(text: text, sid: sid, speed: speed);
    sherpa_onnx.writeWave(
      filename: outputWav,
      samples: audio.samples,
      sampleRate: audio.sampleRate,
    );
  } finally {
    // Release the native TTS handle even when generation or writing fails.
    tts.free();
  }
  print('Saved to $outputWav');
}
90 changes: 90 additions & 0 deletions dart-api-examples/tts/bin/matcha-zh.dart
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright (c) 2025 Xiaomi Corporation
import 'dart:io';

import 'package:args/args.dart';
import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx;

import './init.dart';

/// Synthesizes speech with a Chinese MatchaTTS model and saves it as a wave
/// file.
///
/// Required options: `--acoustic-model`, `--vocoder`, `--lexicon`,
/// `--tokens`, `--output-wav` and `--text`. Prints the usage and exits with
/// status 1 when the command line cannot be parsed or a required option is
/// missing.
Future<void> main(List<String> arguments) async {
  await initSherpaOnnx();

  final parser = ArgParser()
    ..addOption('acoustic-model', help: 'Path to the acoustic model')
    ..addOption('vocoder', help: 'Path to the vocoder model')
    ..addOption('tokens', help: 'Path to tokens.txt')
    ..addOption('lexicon', help: 'Path to lexicon.txt')
    ..addOption(
      'dict-dir',
      help: 'Path to jieba dict directory',
      defaultsTo: '',
    )
    ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '')
    ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '')
    ..addOption('text', help: 'Text to generate TTS for')
    ..addOption('output-wav', help: 'Filename to save the generated audio')
    ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0')
    ..addOption(
      'sid',
      help: 'Speaker ID to select. Used only for multi-speaker TTS',
      defaultsTo: '0',
    );

  final ArgResults res;
  try {
    res = parser.parse(arguments);
  } on FormatException catch (e) {
    // Unknown options or options without a value end up here; show a short
    // message and the usage instead of an uncaught-exception stack trace.
    stderr.writeln(e.message);
    print(parser.usage);
    exit(1);
  }

  // Only options without a default can be null. 'dict-dir', 'rule-fsts',
  // 'rule-fars', 'speed' and 'sid' always carry at least their default value.
  const requiredOptions = [
    'acoustic-model',
    'vocoder',
    'lexicon',
    'tokens',
    'output-wav',
    'text',
  ];
  if (requiredOptions.any((name) => res[name] == null)) {
    print(parser.usage);
    exit(1);
  }

  final acousticModel = res['acoustic-model'] as String;
  final vocoder = res['vocoder'] as String;
  final lexicon = res['lexicon'] as String;
  final tokens = res['tokens'] as String;
  final dictDir = res['dict-dir'] as String;
  final ruleFsts = res['rule-fsts'] as String;
  final ruleFars = res['rule-fars'] as String;
  final text = res['text'] as String;
  final outputWav = res['output-wav'] as String;
  final sid = int.tryParse(res['sid'] as String) ?? 0;

  // The speed is used as a divisor below, so anything that is not a finite,
  // strictly positive number falls back to the default speed.
  var speed = double.tryParse(res['speed'] as String) ?? 1.0;
  if (!speed.isFinite || speed <= 0) {
    speed = 1.0;
  }

  final matcha = sherpa_onnx.OfflineTtsMatchaModelConfig(
    acousticModel: acousticModel,
    vocoder: vocoder,
    lexicon: lexicon,
    tokens: tokens,
    dictDir: dictDir,
    lengthScale: 1 / speed,
  );

  final modelConfig = sherpa_onnx.OfflineTtsModelConfig(
    matcha: matcha,
    numThreads: 1,
    debug: true,
  );
  final config = sherpa_onnx.OfflineTtsConfig(
    model: modelConfig,
    maxNumSenetences: 1,
    ruleFsts: ruleFsts,
    ruleFars: ruleFars,
  );

  final tts = sherpa_onnx.OfflineTts(config);
  try {
    final audio = tts.generate(text: text, sid: sid, speed: speed);
    sherpa_onnx.writeWave(
      filename: outputWav,
      samples: audio.samples,
      sampleRate: audio.sampleRate,
    );
  } finally {
    // Release the native TTS handle even when generation or writing fails.
    tts.free();
  }
  print('Saved to $outputWav');
}
32 changes: 32 additions & 0 deletions dart-api-examples/tts/run-matcha-en.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
#
# Downloads the English MatchaTTS acoustic model and the HiFi-GAN vocoder
# (unless they are already present) and synthesizes one sample utterance with
# ./bin/matcha-en.dart.

set -ex

dart pub get

# Please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
# to download more models.
if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  tar xf matcha-icefall-en_US-ljspeech.tar.bz2
  rm matcha-icefall-en_US-ljspeech.tar.bz2
fi

# The vocoder is a separate download that is not part of the model archive.
if [ ! -f ./hifigan_v2.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# No trailing backslash after the last argument: a dangling continuation would
# swallow whatever command is placed directly below.
dart run \
  ./bin/matcha-en.dart \
  --acoustic-model ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
  --vocoder ./hifigan_v2.onnx \
  --tokens ./matcha-icefall-en_US-ljspeech/tokens.txt \
  --data-dir ./matcha-icefall-en_US-ljspeech/espeak-ng-data \
  --sid 0 \
  --speed 1.0 \
  --output-wav matcha-en-1.wav \
  --text "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."

ls -lh *.wav
45 changes: 45 additions & 0 deletions dart-api-examples/tts/run-matcha-zh.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
#
# Downloads the Chinese MatchaTTS acoustic model and the HiFi-GAN vocoder
# (unless they are already present) and synthesizes two sample utterances with
# ./bin/matcha-zh.dart: one with text-normalization rule FSTs, one without.

set -ex

dart pub get

# Please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
# to download more models.
if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  tar xvf matcha-icefall-zh-baker.tar.bz2
  rm matcha-icefall-zh-baker.tar.bz2
fi

# The vocoder is a separate download that is not part of the model archive.
if [ ! -f ./hifigan_v2.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# First run: uses the phone/date/number rule FSTs shipped with the model.
# No trailing backslash after the last argument: a dangling continuation would
# merge this command with the next one if the blank line were ever removed.
dart run \
  ./bin/matcha-zh.dart \
  --acoustic-model ./matcha-icefall-zh-baker/model-steps-3.onnx \
  --vocoder ./hifigan_v2.onnx \
  --lexicon ./matcha-icefall-zh-baker/lexicon.txt \
  --tokens ./matcha-icefall-zh-baker/tokens.txt \
  --dict-dir ./matcha-icefall-zh-baker/dict \
  --rule-fsts ./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
  --sid 0 \
  --speed 1.0 \
  --output-wav matcha-zh-1.wav \
  --text "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"

# Second run: same model without any rule FSTs.
dart run \
  ./bin/matcha-zh.dart \
  --acoustic-model ./matcha-icefall-zh-baker/model-steps-3.onnx \
  --vocoder ./hifigan_v2.onnx \
  --lexicon ./matcha-icefall-zh-baker/lexicon.txt \
  --tokens ./matcha-icefall-zh-baker/tokens.txt \
  --dict-dir ./matcha-icefall-zh-baker/dict \
  --sid 0 \
  --speed 1.0 \
  --output-wav matcha-zh-2.wav \
  --text "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔."

ls -lh *.wav
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@ if [[ ! -f ./sherpa-onnx-vits-zh-ll/tokens.txt ]]; then
fi

dart run \
./bin/zh.dart \
./bin/vits-zh.dart \
--model ./sherpa-onnx-vits-zh-ll/model.onnx \
--lexicon ./sherpa-onnx-vits-zh-ll/lexicon.txt \
--tokens ./sherpa-onnx-vits-zh-ll/tokens.txt \
--dict-dir ./sherpa-onnx-vits-zh-ll/dict \
--sid 2 \
--speed 1.0 \
--text '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。' \
--output-wav zh-jieba-2.wav
--output-wav vits-zh-jieba-2.wav

dart run \
./bin/zh.dart \
./bin/vits-zh.dart \
--model ./sherpa-onnx-vits-zh-ll/model.onnx \
--lexicon ./sherpa-onnx-vits-zh-ll/lexicon.txt \
--tokens ./sherpa-onnx-vits-zh-ll/tokens.txt \
Expand All @@ -36,6 +36,6 @@ dart run \
--sid 3 \
--speed 1.0 \
--text '今天是2024年6月15号,13点23分。如果有困难,请拨打110或者18920240511。123456块钱。' \
--output-wav zh-jieba-3.wav
--output-wav vits-zh-jieba-3.wav

ls -lh *.wav
17 changes: 17 additions & 0 deletions flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,22 @@ final class SherpaOnnxOfflineTtsVitsModelConfig extends Struct {
external Pointer<Utf8> dictDir;
}

/// FFI struct holding the configuration of a MatchaTTS model.
///
/// As a `dart:ffi` [Struct], its memory layout is defined by the order and
/// native types of the fields below, so fields must not be reordered.
/// NOTE(review): presumably mirrors the struct of the same name in the
/// sherpa-onnx C API - confirm against the C header before changing it.
final class SherpaOnnxOfflineTtsMatchaModelConfig extends Struct {
/// Acoustic model path as a native UTF-8 string.
external Pointer<Utf8> acousticModel;
/// Vocoder model path as a native UTF-8 string.
external Pointer<Utf8> vocoder;
/// Lexicon file path as a native UTF-8 string.
external Pointer<Utf8> lexicon;
/// Tokens file path as a native UTF-8 string.
external Pointer<Utf8> tokens;
/// Data directory path as a native UTF-8 string (the Dart examples pass an
/// espeak-ng-data directory here).
external Pointer<Utf8> dataDir;

/// Noise scale, stored as a 32-bit native float.
@Float()
external double noiseScale;

/// Length scale, stored as a 32-bit native float (the Dart examples set it
/// to `1 / speed`).
@Float()
external double lengthScale;

/// Dictionary directory path as a native UTF-8 string (the Dart examples
/// pass a jieba dict directory here).
external Pointer<Utf8> dictDir;
}

final class SherpaOnnxOfflineTtsModelConfig extends Struct {
external SherpaOnnxOfflineTtsVitsModelConfig vits;
@Int32()
Expand All @@ -140,6 +156,7 @@ final class SherpaOnnxOfflineTtsModelConfig extends Struct {
external int debug;

external Pointer<Utf8> provider;
external SherpaOnnxOfflineTtsMatchaModelConfig matcha;
}

final class SherpaOnnxOfflineTtsConfig extends Struct {
Expand Down
Loading
Loading