From c6f22f25fad81c1053064323bba675df1666af0c Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 4 Jun 2024 12:10:39 +0800 Subject: [PATCH 001/237] export telespeech ctc models to sherpa-onnx (#968) --- .github/workflows/export-telespeech-ctc.yaml | 50 ++++++ scripts/tele-speech/.gitignore | 1 + scripts/tele-speech/README.md | 13 ++ scripts/tele-speech/add-metadata.py | 74 +++++++++ scripts/tele-speech/run.sh | 48 ++++++ scripts/tele-speech/test.py | 156 +++++++++++++++++++ 6 files changed, 342 insertions(+) create mode 100644 .github/workflows/export-telespeech-ctc.yaml create mode 100644 scripts/tele-speech/.gitignore create mode 100644 scripts/tele-speech/README.md create mode 100755 scripts/tele-speech/add-metadata.py create mode 100755 scripts/tele-speech/run.sh create mode 100755 scripts/tele-speech/test.py diff --git a/.github/workflows/export-telespeech-ctc.yaml b/.github/workflows/export-telespeech-ctc.yaml new file mode 100644 index 000000000..38b3ce68f --- /dev/null +++ b/.github/workflows/export-telespeech-ctc.yaml @@ -0,0 +1,50 @@ +name: export-telespeech-ctc-to-onnx + +on: + workflow_dispatch: + +concurrency: + group: export-telespeech-ctc-to-onnx-${{ github.ref }} + cancel-in-progress: true + +jobs: + export-telespeech-ctc-to-onnx: + if: github.repository_owner == 'k2-fsa' || github.repository_owner == 'csukuangfj' + name: telespeech + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.10"] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Python dependencies + shell: bash + run: | + pip install onnx onnxruntime soundfile librosa numpy kaldi-native-fbank + + - name: Run + shell: bash + run: | + cd scripts/tele-speech + ./run.sh + + ./test.py + + - name: Release + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + file: 
./*.tar.bz2 + overwrite: true + repo_name: k2-fsa/sherpa-onnx + repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} + tag: asr-models diff --git a/scripts/tele-speech/.gitignore b/scripts/tele-speech/.gitignore new file mode 100644 index 000000000..a6c57f5fb --- /dev/null +++ b/scripts/tele-speech/.gitignore @@ -0,0 +1 @@ +*.json diff --git a/scripts/tele-speech/README.md b/scripts/tele-speech/README.md new file mode 100644 index 000000000..fbd808c2d --- /dev/null +++ b/scripts/tele-speech/README.md @@ -0,0 +1,13 @@ +# Introduction + +This folder contains scripts about adding metadata to +onnx models from +https://hf-mirror.com/lovemefan/telespeech/tree/main + +Please see + + - https://github.com/Tele-AI/TeleSpeech-ASR + - https://github.com/lovemefan/telespeech-asr-python + - [TeleSpeech模型社区许可协议.pdf](https://github.com/Tele-AI/TeleSpeech-ASR/blob/master/TeleSpeech%E6%A8%A1%E5%9E%8B%E7%A4%BE%E5%8C%BA%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE.pdf) + +for more details. diff --git a/scripts/tele-speech/add-metadata.py b/scripts/tele-speech/add-metadata.py new file mode 100755 index 000000000..63f749430 --- /dev/null +++ b/scripts/tele-speech/add-metadata.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +import json +from typing import Dict + +import onnx +from onnxruntime.quantization import QuantType, quantize_dynamic + + +def add_meta_data(filename: str, meta_data: Dict[str, str]): + """Add meta data to an ONNX model. It is changed in-place. + + Args: + filename: + Filename of the ONNX model to be changed. + meta_data: + Key-value pairs. 
+ """ + model = onnx.load(filename) + + while len(model.metadata_props): + model.metadata_props.pop() + + for key, value in meta_data.items(): + meta = model.metadata_props.add() + meta.key = key + meta.value = value + + onnx.save(model, filename) + + +def main(): + with open("./vocab.json", "r", encoding="utf-8") as f: + tokens = json.load(f) + + vocab_size = len(tokens) + with open("tokens.txt", "w", encoding="utf-8") as f: + for token, idx in tokens.items(): + if idx == 0: + f.write(" 0\n") + else: + f.write(f"{token} {idx}\n") + + filename = "model.onnx" + meta_data = { + "model_type": "telespeech_ctc", + "version": "1", + "model_author": "Tele-AI", + "comment": "See also https://github.com/lovemefan/telespeech-asr-python", + "license": "https://github.com/Tele-AI/TeleSpeech-ASR/blob/master/TeleSpeech%E6%A8%A1%E5%9E%8B%E7%A4%BE%E5%8C%BA%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE.pdf", + "url": "https://github.com/Tele-AI/TeleSpeech-ASR", + } + + add_meta_data(filename, meta_data) + + filename_int8 = f"model.int8.onnx" + quantize_dynamic( + model_input=filename, + model_output=filename_int8, + op_types_to_quantize=["MatMul"], + weight_type=QuantType.QInt8, + ) + + # filename_uint8 = f"model.uint8.onnx" + # quantize_dynamic( + # model_input=filename, + # model_output=filename_uint8, + # op_types_to_quantize=["MatMul"], + # weight_type=QuantType.QUInt8, + # ) + + +if __name__ == "__main__": + main() diff --git a/scripts/tele-speech/run.sh b/scripts/tele-speech/run.sh new file mode 100755 index 000000000..2ee2107ec --- /dev/null +++ b/scripts/tele-speech/run.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +curl -SL -O https://hf-mirror.com/lovemefan/telespeech/resolve/main/model_export.onnx + +mv model_export.onnx model.onnx + +curl -SL -O https://hf-mirror.com/lovemefan/telespeech/resolve/main/vocab.json + +curl -SL -O https://github.com/csukuangfj/models/releases/download/a/TeleSpeech.pdf +curl -SL -O 
https://hf-mirror.com/csukuangfj/sherpa-onnx-paraformer-zh-small-2024-03-09/resolve/main/test_wavs/3-sichuan.wav +curl -SL -O https://hf-mirror.com/csukuangfj/sherpa-onnx-paraformer-zh-small-2024-03-09/resolve/main/test_wavs/4-tianjin.wav +curl -SL -O https://hf-mirror.com/csukuangfj/sherpa-onnx-paraformer-zh-small-2024-03-09/resolve/main/test_wavs/5-henan.wav + +ls -lh + +./add-metadata.py + +dst=sherpa-onnx-telespeech-ctc-zh-2024-06-04 +mkdir $dst +mkdir $dst/test_wavs +cp -v model.onnx $dst/ +cp -v tokens.txt $dst +cp -v *.wav $dst/test_wavs +cp -v *.pdf $dst +cp -v README.md $dst +cp -v *.py $dst + +ls -lh $dst + +tar cvjfv ${dst}.tar.bz2 $dst + +dst=sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 +mkdir $dst +mkdir $dst/test_wavs +cp -v model.int8.onnx $dst/ +cp -v tokens.txt $dst +cp -v *.wav $dst/test_wavs +cp -v *.pdf $dst +cp -v README.md $dst +cp -v *.py $dst + +ls -lh $dst + +tar cvjfv ${dst}.tar.bz2 $dst + +cp -v *.tar.bz2 ../.. + +ls -lh ../../ diff --git a/scripts/tele-speech/test.py b/scripts/tele-speech/test.py new file mode 100755 index 000000000..71953700f --- /dev/null +++ b/scripts/tele-speech/test.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +# Copyright 2024 Xiaomi Corp. 
(authors: Fangjun Kuang) + +from typing import Tuple + +import kaldi_native_fbank as knf +import numpy as np +import onnxruntime as ort +import soundfile as sf + +""" +NodeArg(name='feats', type='tensor(float)', shape=[1, 'T', 40]) +----- +NodeArg(name='logits', type='tensor(float)', shape=['Addlogits_dim_0', 1, 7535]) +""" + + +class OnnxModel: + def __init__( + self, + filename: str, + ): + session_opts = ort.SessionOptions() + session_opts.inter_op_num_threads = 1 + session_opts.intra_op_num_threads = 1 + + self.session_opts = session_opts + + self.model = ort.InferenceSession( + filename, + sess_options=self.session_opts, + providers=["CPUExecutionProvider"], + ) + + self.show() + + def show(self): + for i in self.model.get_inputs(): + print(i) + + print("-----") + + for i in self.model.get_outputs(): + print(i) + + def __call__(self, x): + """ + Args: + x: a float32 tensor of shape (N, T, C) + """ + logits = self.model.run( + [ + self.model.get_outputs()[0].name, + ], + { + self.model.get_inputs()[0].name: x, + }, + )[0] + + return logits + + +def load_audio(filename: str) -> Tuple[np.ndarray, int]: + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + samples = np.ascontiguousarray(data) + return samples, sample_rate + + +def get_features(test_wav_filename): + samples, sample_rate = load_audio(test_wav_filename) + + if sample_rate != 16000: + import librosa + + samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000) + sample_rate = 16000 + + samples *= 372768 + + opts = knf.MfccOptions() + # See https://github.com/Tele-AI/TeleSpeech-ASR/blob/master/mfcc_hires.conf + opts.frame_opts.dither = 0 + + opts.num_ceps = 40 + opts.use_energy = False + + opts.mel_opts.num_bins = 40 + opts.mel_opts.low_freq = 40 + opts.mel_opts.high_freq = -200 + + mfcc = knf.OnlineMfcc(opts) + mfcc.accept_waveform(16000, samples) + frames = [] + for i in range(mfcc.num_frames_ready): + 
frames.append(mfcc.get_frame(i)) + + frames = np.stack(frames, axis=0) + return frames + + +def cmvn(features): + # See https://github.com/Tele-AI/TeleSpeech-ASR/blob/master/wenet_representation/conf/train_d2v2_ark_conformer.yaml#L70 + # https://github.com/Tele-AI/TeleSpeech-ASR/blob/master/wenet_representation/wenet/dataset/dataset.py#L184 + # https://github.com/Tele-AI/TeleSpeech-ASR/blob/master/wenet_representation/wenet/dataset/processor.py#L278 + mean = features.mean(axis=0, keepdims=True) + std = features.std(axis=0, keepdims=True) + return (features - mean) / (std + 1e-5) + + +def main(): + # Please download the test data from + # https://hf-mirror.com/csukuangfj/sherpa-onnx-paraformer-zh-small-2024-03-09/tree/main/test_wavs + test_wav_filename = "./3-sichuan.wav" + test_wav_filename = "./4-tianjin.wav" + test_wav_filename = "./5-henan.wav" + + features = get_features(test_wav_filename) + + features = cmvn(features) + + features = np.expand_dims(features, axis=0) # (T, C) -> (N, T, C) + + model_filename = "./model.int8.onnx" + model = OnnxModel(model_filename) + logits = model(features) + logits = logits.squeeze(axis=1) # remove batch axis + ids = logits.argmax(axis=-1) + + id2token = dict() + with open("./tokens.txt", encoding="utf-8") as f: + for line in f: + t, idx = line.split() + id2token[int(idx)] = t + + tokens = [] + + blank = 0 + prev = -1 + + for k in ids: + if k != blank and k != prev: + tokens.append(k) + prev = k + + tokens = [id2token[i] for i in tokens] + text = "".join(tokens) + print(text) + + +if __name__ == "__main__": + main() From f8dbc1014620c01eb0d27b0a20222dae31b8cae5 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 4 Jun 2024 17:05:49 +0800 Subject: [PATCH 002/237] Fix CI (#964) --- scripts/dotnet/OfflineModelConfig.cs | 8 +++ scripts/dotnet/OnlineModelConfig.cs | 8 +++ scripts/go/sherpa_onnx.go | 17 ++++++ .../node-addon-api/src/non-streaming-asr.cc | 10 ++++ scripts/node-addon-api/src/streaming-asr.cc | 10 ++++ 
swift-api-examples/SherpaOnnx.swift | 16 ++++-- wasm/asr/sherpa-onnx-asr.js | 56 +++++++++++++++++-- wasm/asr/sherpa-onnx-wasm-main-asr.cc | 4 +- wasm/nodejs/sherpa-onnx-wasm-nodejs.cc | 4 +- 9 files changed, 123 insertions(+), 10 deletions(-) diff --git a/scripts/dotnet/OfflineModelConfig.cs b/scripts/dotnet/OfflineModelConfig.cs index 58b24dbbe..2dc2347c1 100644 --- a/scripts/dotnet/OfflineModelConfig.cs +++ b/scripts/dotnet/OfflineModelConfig.cs @@ -23,6 +23,8 @@ public OfflineModelConfig() Debug = 0; Provider = "cpu"; ModelType = ""; + ModelingUnit = "cjkchar"; + BpeVocab = ""; } public OfflineTransducerModelConfig Transducer; public OfflineParaformerModelConfig Paraformer; @@ -42,5 +44,11 @@ public OfflineModelConfig() [MarshalAs(UnmanagedType.LPStr)] public string ModelType; + + [MarshalAs(UnmanagedType.LPStr)] + public string ModelingUnit; + + [MarshalAs(UnmanagedType.LPStr)] + public string BpeVocab; } } diff --git a/scripts/dotnet/OnlineModelConfig.cs b/scripts/dotnet/OnlineModelConfig.cs index 1471959d8..dcba23cf8 100644 --- a/scripts/dotnet/OnlineModelConfig.cs +++ b/scripts/dotnet/OnlineModelConfig.cs @@ -23,6 +23,8 @@ public OnlineModelConfig() Provider = "cpu"; Debug = 0; ModelType = ""; + ModelingUnit = "cjkchar"; + BpeVocab = ""; } public OnlineTransducerModelConfig Transducer; @@ -43,5 +45,11 @@ public OnlineModelConfig() [MarshalAs(UnmanagedType.LPStr)] public string ModelType; + + [MarshalAs(UnmanagedType.LPStr)] + public string ModelingUnit; + + [MarshalAs(UnmanagedType.LPStr)] + public string BpeVocab; } } diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index af60d959f..e89787da9 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -87,6 +87,8 @@ type OnlineModelConfig struct { Provider string // Optional. Valid values are: cpu, cuda, coreml Debug int // 1 to show model meta information while loading it. ModelType string // Optional. 
You can specify it for faster model initialization + ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe + BpeVocab string // Optional. } // Configuration for the feature extractor @@ -187,6 +189,12 @@ func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer { c.model_config.model_type = C.CString(config.ModelConfig.ModelType) defer C.free(unsafe.Pointer(c.model_config.model_type)) + c.model_config.modeling_unit = C.CString(config.ModelConfig.ModelingUnit) + defer C.free(unsafe.Pointer(c.model_config.modeling_unit)) + + c.model_config.bpe_vocab = C.CString(config.ModelConfig.BpeVocab) + defer C.free(unsafe.Pointer(c.model_config.bpe_vocab)) + c.decoding_method = C.CString(config.DecodingMethod) defer C.free(unsafe.Pointer(c.decoding_method)) @@ -372,6 +380,9 @@ type OfflineModelConfig struct { // Optional. Specify it for faster model initialization. ModelType string + + ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe + BpeVocab string // Optional. } // Configuration for the offline/non-streaming recognizer. 
@@ -460,6 +471,12 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer { c.model_config.model_type = C.CString(config.ModelConfig.ModelType) defer C.free(unsafe.Pointer(c.model_config.model_type)) + c.model_config.modeling_unit = C.CString(config.ModelConfig.ModelingUnit) + defer C.free(unsafe.Pointer(c.model_config.modeling_unit)) + + c.model_config.bpe_vocab = C.CString(config.ModelConfig.BpeVocab) + defer C.free(unsafe.Pointer(c.model_config.bpe_vocab)) + c.lm_config.model = C.CString(config.LmConfig.Model) defer C.free(unsafe.Pointer(c.lm_config.model)) diff --git a/scripts/node-addon-api/src/non-streaming-asr.cc b/scripts/node-addon-api/src/non-streaming-asr.cc index a1749a47e..d101c7eb6 100644 --- a/scripts/node-addon-api/src/non-streaming-asr.cc +++ b/scripts/node-addon-api/src/non-streaming-asr.cc @@ -126,6 +126,8 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider); SHERPA_ONNX_ASSIGN_ATTR_STR(model_type, modelType); + SHERPA_ONNX_ASSIGN_ATTR_STR(modeling_unit, modelingUnit); + SHERPA_ONNX_ASSIGN_ATTR_STR(bpe_vocab, bpeVocab); return c; } @@ -232,6 +234,14 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { delete[] c.model_config.model_type; } + if (c.model_config.modeling_unit) { + delete[] c.model_config.modeling_unit; + } + + if (c.model_config.bpe_vocab) { + delete[] c.model_config.bpe_vocab; + } + if (c.lm_config.model) { delete[] c.lm_config.model; } diff --git a/scripts/node-addon-api/src/streaming-asr.cc b/scripts/node-addon-api/src/streaming-asr.cc index fec4a46fc..59312a230 100644 --- a/scripts/node-addon-api/src/streaming-asr.cc +++ b/scripts/node-addon-api/src/streaming-asr.cc @@ -118,6 +118,8 @@ SherpaOnnxOnlineModelConfig GetOnlineModelConfig(Napi::Object obj) { } SHERPA_ONNX_ASSIGN_ATTR_STR(model_type, modelType); + SHERPA_ONNX_ASSIGN_ATTR_STR(modeling_unit, modelingUnit); + SHERPA_ONNX_ASSIGN_ATTR_STR(bpe_vocab, 
bpeVocab); return c; } @@ -228,6 +230,14 @@ static Napi::External CreateOnlineRecognizerWrapper( delete[] c.model_config.model_type; } + if (c.model_config.modeling_unit) { + delete[] c.model_config.modeling_unit; + } + + if (c.model_config.bpe_vocab) { + delete[] c.model_config.bpe_vocab; + } + if (c.decoding_method) { delete[] c.decoding_method; } diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index f39d5ebee..0c8d22f3d 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -88,7 +88,9 @@ func sherpaOnnxOnlineModelConfig( numThreads: Int = 1, provider: String = "cpu", debug: Int = 0, - modelType: String = "" + modelType: String = "", + modelingUnit: String = "cjkchar", + bpeVocab: String = "" ) -> SherpaOnnxOnlineModelConfig { return SherpaOnnxOnlineModelConfig( transducer: transducer, @@ -98,7 +100,9 @@ func sherpaOnnxOnlineModelConfig( num_threads: Int32(numThreads), provider: toCPointer(provider), debug: Int32(debug), - model_type: toCPointer(modelType) + model_type: toCPointer(modelType), + modeling_unit: toCPointer(modelingUnit), + bpeVocab: toCPointer(bpeVocab) ) } @@ -354,7 +358,9 @@ func sherpaOnnxOfflineModelConfig( numThreads: Int = 1, provider: String = "cpu", debug: Int = 0, - modelType: String = "" + modelType: String = "", + modelingUnit: String = "cjkchar", + bpeVocab: String = "" ) -> SherpaOnnxOfflineModelConfig { return SherpaOnnxOfflineModelConfig( transducer: transducer, @@ -366,7 +372,9 @@ func sherpaOnnxOfflineModelConfig( num_threads: Int32(numThreads), debug: Int32(debug), provider: toCPointer(provider), - model_type: toCPointer(modelType) + model_type: toCPointer(modelType), + modeling_unit: toCPointer(modelingUnit), + bpeVocab: toCPointer(bpeVocab) ) } diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index d68b22e20..c77794a68 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ -137,7 +137,7 @@ function 
initSherpaOnnxOnlineModelConfig(config, Module) { const ctc = initSherpaOnnxOnlineZipformer2CtcModelConfig( config.zipformer2Ctc, Module); - const len = transducer.len + paraformer.len + ctc.len + 5 * 4; + const len = transducer.len + paraformer.len + ctc.len + 7 * 4; const ptr = Module._malloc(len); let offset = 0; @@ -153,7 +153,11 @@ function initSherpaOnnxOnlineModelConfig(config, Module) { const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; const providerLen = Module.lengthBytesUTF8(config.provider) + 1; const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; - const bufferLen = tokensLen + providerLen + modelTypeLen; + const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1; + const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1; + + const bufferLen = + tokensLen + providerLen + modelTypeLen + modelingUnitLen + bpeVocabLen; const buffer = Module._malloc(bufferLen); offset = 0; @@ -164,6 +168,14 @@ function initSherpaOnnxOnlineModelConfig(config, Module) { offset += providerLen; Module.stringToUTF8(config.modelType, buffer + offset, modelTypeLen); + offset += modelTypeLen; + + Module.stringToUTF8( + config.modelingUnit || '', buffer + offset, modelingUnitLen); + offset += modelingUnitLen; + + Module.stringToUTF8(config.bpeVocab || '', buffer + offset, bpeVocabLen); + offset += bpeVocabLen; offset = transducer.len + paraformer.len + ctc.len; Module.setValue(ptr + offset, buffer, 'i8*'); // tokens @@ -182,6 +194,17 @@ function initSherpaOnnxOnlineModelConfig(config, Module) { ptr + offset, buffer + tokensLen + providerLen, 'i8*'); // modelType offset += 4; + Module.setValue( + ptr + offset, buffer + tokensLen + providerLen + modelTypeLen, + 'i8*'); // modelingUnit + offset += 4; + + Module.setValue( + ptr + offset, + buffer + tokensLen + providerLen + modelTypeLen + modelingUnitLen, + 'i8*'); // bpeVocab + offset += 4; + return { buffer: buffer, ptr: ptr, len: len, transducer: transducer, paraformer: 
paraformer, ctc: ctc @@ -317,6 +340,8 @@ function createOnlineRecognizer(Module, myConfig) { provider: 'cpu', debug: 1, modelType: '', + modelingUnit: 'cjkchar', + bpeVocab: '', }; const featureConfig = { @@ -504,7 +529,7 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { const tdnn = initSherpaOnnxOfflineTdnnModelConfig(config.tdnn, Module); const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + - tdnn.len + 5 * 4; + tdnn.len + 7 * 4; const ptr = Module._malloc(len); let offset = 0; @@ -526,7 +551,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; const providerLen = Module.lengthBytesUTF8(config.provider) + 1; const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; - const bufferLen = tokensLen + providerLen + modelTypeLen; + const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1; + const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1; + + const bufferLen = + tokensLen + providerLen + modelTypeLen + modelingUnitLen + bpeVocabLen; const buffer = Module._malloc(bufferLen); offset = 0; @@ -537,6 +566,14 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { offset += providerLen; Module.stringToUTF8(config.modelType, buffer + offset, modelTypeLen); + offset += modelTypeLen; + + Module.stringToUTF8( + config.modelingUnit || '', buffer + offset, modelingUnitLen); + offset += modelingUnitLen; + + Module.stringToUTF8(config.bpeVocab || '', buffer + offset, bpeVocabLen); + offset += bpeVocabLen; offset = transducer.len + paraformer.len + nemoCtc.len + whisper.len + tdnn.len; @@ -556,6 +593,17 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { ptr + offset, buffer + tokensLen + providerLen, 'i8*'); // modelType offset += 4; + Module.setValue( + ptr + offset, buffer + tokensLen + providerLen + modelTypeLen, + 'i8*'); // modelingUnit + offset += 4; + + Module.setValue( + ptr + offset, + buffer + 
tokensLen + providerLen + modelTypeLen + modelingUnitLen, + 'i8*'); // bpeVocab + offset += 4; + return { buffer: buffer, ptr: ptr, len: len, transducer: transducer, paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn diff --git a/wasm/asr/sherpa-onnx-wasm-main-asr.cc b/wasm/asr/sherpa-onnx-wasm-main-asr.cc index 70d13f1c4..de0cf1430 100644 --- a/wasm/asr/sherpa-onnx-wasm-main-asr.cc +++ b/wasm/asr/sherpa-onnx-wasm-main-asr.cc @@ -19,7 +19,7 @@ static_assert(sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) == 1 * 4, ""); static_assert(sizeof(SherpaOnnxOnlineModelConfig) == sizeof(SherpaOnnxOnlineTransducerModelConfig) + sizeof(SherpaOnnxOnlineParaformerModelConfig) + - sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 5 * 4, + sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 7 * 4, ""); static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxOnlineCtcFstDecoderConfig) == 2 * 4, ""); @@ -52,6 +52,8 @@ void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) { fprintf(stdout, "provider: %s\n", model_config->provider); fprintf(stdout, "debug: %d\n", model_config->debug); fprintf(stdout, "model type: %s\n", model_config->model_type); + fprintf(stdout, "modeling unit: %s\n", model_config->modeling_unit); + fprintf(stdout, "bpe vocab: %s\n", model_config->bpe_vocab); fprintf(stdout, "----------feat config----------\n"); fprintf(stdout, "sample rate: %d\n", feat->sample_rate); diff --git a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc index 539699cc4..ceb5a2442 100644 --- a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc +++ b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc @@ -23,7 +23,7 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == sizeof(SherpaOnnxOfflineParaformerModelConfig) + sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) + sizeof(SherpaOnnxOfflineWhisperModelConfig) + - sizeof(SherpaOnnxOfflineTdnnModelConfig) + 5 * 4, + sizeof(SherpaOnnxOfflineTdnnModelConfig) + 7 * 4, ""); 
static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) == @@ -90,6 +90,8 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { fprintf(stdout, "provider: %s\n", model_config->provider); fprintf(stdout, "debug: %d\n", model_config->debug); fprintf(stdout, "model type: %s\n", model_config->model_type); + fprintf(stdout, "modeling unit: %s\n", model_config->modeling_unit); + fprintf(stdout, "bpe vocab: %s\n", model_config->bpe_vocab); fprintf(stdout, "----------feat config----------\n"); fprintf(stdout, "sample rate: %d\n", feat->sample_rate); From fd5a0d1e00ffa11b2a3ded064fc6f9a0910dc845 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 5 Jun 2024 00:26:40 +0800 Subject: [PATCH 003/237] Add C++ runtime for Tele-AI/TeleSpeech-ASR (#970) --- .github/scripts/test-dot-net.sh | 19 +-- .github/scripts/test-offline-ctc.sh | 33 ++++ .github/scripts/test-python.sh | 12 ++ .../workflows/build-wheels-macos-arm64.yaml | 4 +- .../build-wheels-macos-universal2.yaml | 90 +++++++++++ .github/workflows/build-wheels-macos-x64.yaml | 4 +- .github/workflows/export-telespeech-ctc.yaml | 46 ++++++ .github/workflows/linux.yaml | 16 +- .github/workflows/macos.yaml | 14 +- .github/workflows/swift.yaml | 2 +- .github/workflows/test-go.yaml | 18 ++- .github/workflows/test-piper-phonemize.yaml | 2 +- .gitignore | 1 + CMakeLists.txt | 2 +- cmake/espeak-ng-for-piper.cmake | 4 +- cmake/kaldi-native-fbank.cmake | 16 +- .../offline-decode-files/Program.cs | 11 ++ .../run-telespeech-ctc.sh | 15 ++ .../non-streaming-decode-files/main.go | 3 + .../run-telespeech-ctc.sh | 19 +++ .../NonStreamingDecodeFileParaformer.java | 2 +- .../NonStreamingDecodeFileTeleSpeechCtc.java | 47 ++++++ ...n-streaming-decode-file-tele-speech-ctc.sh | 37 +++++ .../offline-telespeech-ctc-decode-files.py | 60 ++++++++ scripts/apk/generate-vad-asr-apk-script.py | 16 ++ scripts/dotnet/OfflineModelConfig.cs | 4 + scripts/dotnet/run.sh 
| 4 +- .../run-telespeech-ctc.sh | 1 + scripts/go/sherpa_onnx.go | 8 +- .../node-addon-api/src/non-streaming-asr.cc | 5 + sherpa-onnx/c-api/c-api.cc | 3 + sherpa-onnx/c-api/c-api.h | 1 + sherpa-onnx/csrc/CMakeLists.txt | 1 + sherpa-onnx/csrc/features.cc | 101 ++++++++---- sherpa-onnx/csrc/features.h | 11 +- sherpa-onnx/csrc/offline-ctc-model.cc | 17 +++ sherpa-onnx/csrc/offline-model-config.cc | 18 ++- sherpa-onnx/csrc/offline-model-config.h | 3 + .../csrc/offline-recognizer-ctc-impl.h | 11 ++ sherpa-onnx/csrc/offline-recognizer-impl.cc | 30 +++- sherpa-onnx/csrc/offline-stream.cc | 90 +++++++---- .../csrc/offline-telespeech-ctc-model.cc | 144 ++++++++++++++++++ .../csrc/offline-telespeech-ctc-model.h | 81 ++++++++++ sherpa-onnx/csrc/online-model-config.cc | 2 +- .../k2fsa/sherpa/onnx/OfflineModelConfig.java | 13 ++ sherpa-onnx/jni/offline-recognizer.cc | 6 + sherpa-onnx/kotlin-api/OfflineRecognizer.kt | 10 ++ .../python/csrc/offline-model-config.cc | 41 ++--- .../python/sherpa_onnx/offline_recognizer.py | 65 ++++++++ swift-api-examples/SherpaOnnx.swift | 8 +- wasm/asr/sherpa-onnx-asr.js | 19 ++- wasm/nodejs/sherpa-onnx-wasm-nodejs.cc | 3 +- 52 files changed, 1050 insertions(+), 143 deletions(-) create mode 100644 .github/workflows/build-wheels-macos-universal2.yaml create mode 100755 dotnet-examples/offline-decode-files/run-telespeech-ctc.sh create mode 100755 go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh create mode 100644 java-api-examples/NonStreamingDecodeFileTeleSpeechCtc.java create mode 100755 java-api-examples/run-non-streaming-decode-file-tele-speech-ctc.sh create mode 100755 python-api-examples/offline-telespeech-ctc-decode-files.py create mode 120000 scripts/go/_internal/non-streaming-decode-files/run-telespeech-ctc.sh create mode 100644 sherpa-onnx/csrc/offline-telespeech-ctc-model.cc create mode 100644 sherpa-onnx/csrc/offline-telespeech-ctc-model.h diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index 
1843cdf42..6ae126037 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -2,7 +2,16 @@ cd dotnet-examples/ -cd vad-non-streaming-asr-paraformer +cd ./offline-decode-files +./run-telespeech-ctc.sh +./run-nemo-ctc.sh +./run-paraformer.sh +./run-zipformer.sh +./run-hotwords.sh +./run-whisper.sh +./run-tdnn-yesno.sh + +cd ../vad-non-streaming-asr-paraformer ./run.sh cd ../offline-punctuation @@ -22,14 +31,6 @@ cd ../online-decode-files ./run-transducer.sh ./run-paraformer.sh -cd ../offline-decode-files -./run-nemo-ctc.sh -./run-paraformer.sh -./run-zipformer.sh -./run-hotwords.sh -./run-whisper.sh -./run-tdnn-yesno.sh - cd ../offline-tts ./run-aishell3.sh ./run-piper.sh diff --git a/.github/scripts/test-offline-ctc.sh b/.github/scripts/test-offline-ctc.sh index 4a5955464..7fc5e4501 100755 --- a/.github/scripts/test-offline-ctc.sh +++ b/.github/scripts/test-offline-ctc.sh @@ -15,6 +15,39 @@ echo "PATH: $PATH" which $EXE +log "test offline TeleSpeech CTC" +url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 +name=$(basename $url) +repo=$(basename -s .tar.bz2 $name) + +curl -SL -O $url +tar xvf $name +rm $name +ls -lh $repo + +test_wavs=( +3-sichuan.wav +4-tianjin.wav +5-henan.wav +) +for w in ${test_wavs[@]}; do + time $EXE \ + --tokens=$repo/tokens.txt \ + --telespeech-ctc=$repo/model.int8.onnx \ + --debug=1 \ + $repo/test_wavs/$w +done + +time $EXE \ + --tokens=$repo/tokens.txt \ + --telespeech-ctc=$repo/model.int8.onnx \ + --debug=1 \ + $repo/test_wavs/3-sichuan.wav \ + $repo/test_wavs/4-tianjin.wav \ + $repo/test_wavs/5-henan.wav + +rm -rf $repo + log "-----------------------------------------------------------------" log "Run Nemo fast conformer hybrid transducer ctc models (CTC branch)" log "-----------------------------------------------------------------" diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh index e908f62b2..a52b5b910 
100755 --- a/.github/scripts/test-python.sh +++ b/.github/scripts/test-python.sh @@ -10,6 +10,18 @@ log() { export GIT_CLONE_PROTECTION_ACTIVE=false +log "test offline TeleSpeech CTC" +url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 +name=$(basename $url) +repo=$(basename -s .tar.bz2 $name) + +curl -SL -O $url +tar xvf $name +rm $name +ls -lh $repo +python3 ./python-api-examples/offline-telespeech-ctc-decode-files.py +rm -rf $repo + log "test online NeMo CTC" url=https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-80ms.tar.bz2 diff --git a/.github/workflows/build-wheels-macos-arm64.yaml b/.github/workflows/build-wheels-macos-arm64.yaml index 2cdea3f78..9a8edd504 100644 --- a/.github/workflows/build-wheels-macos-arm64.yaml +++ b/.github/workflows/build-wheels-macos-arm64.yaml @@ -82,7 +82,7 @@ jobs: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | - python3 -m pip install --upgrade pip - python3 -m pip install wheel twine setuptools + python3 -m pip install --break-system-packages --upgrade pip + python3 -m pip install --break-system-packages wheel twine setuptools twine upload ./wheelhouse/*.whl diff --git a/.github/workflows/build-wheels-macos-universal2.yaml b/.github/workflows/build-wheels-macos-universal2.yaml new file mode 100644 index 000000000..4d52110ee --- /dev/null +++ b/.github/workflows/build-wheels-macos-universal2.yaml @@ -0,0 +1,90 @@ +name: build-wheels-macos-universal2 + +on: + push: + branches: + - wheel + tags: + - '*' + workflow_dispatch: + +env: + SHERPA_ONNX_IS_IN_GITHUB_ACTIONS: 1 + +concurrency: + group: build-wheels-macos-universal2-${{ github.ref }} + cancel-in-progress: true + +jobs: + build_wheels_macos_universal2: + name: ${{ matrix.python-version }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest] + 
python-version: ["cp38", "cp39", "cp310", "cp311", "cp312"] + + steps: + - uses: actions/checkout@v4 + + - name: Build wheels + uses: pypa/cibuildwheel@v2.15.0 + env: + CIBW_BUILD: "${{ matrix.python-version}}-* " + CIBW_ENVIRONMENT: SHERPA_ONNX_CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES='arm64;x86_64'" + CIBW_ARCHS: "universal2" + CIBW_BUILD_VERBOSITY: 3 + + # Don't repair macOS wheels + CIBW_REPAIR_WHEEL_COMMAND_MACOS: "" + + - name: Display wheels + shell: bash + run: | + ls -lh ./wheelhouse/ + + - uses: actions/upload-artifact@v4 + with: + name: wheel-${{ matrix.python-version }} + path: ./wheelhouse/*.whl + + - name: Publish to huggingface + if: matrix.python-version == 'cp38' + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + + git clone https://huggingface.co/csukuangfj/sherpa-onnx-wheels huggingface + cd huggingface + git fetch + git pull + git merge -m "merge remote" --ff origin main + + cp -v ../wheelhouse/*.whl . + + git status + git add . 
+ git commit -m "add more wheels" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-wheels main + + - name: Publish wheels to PyPI + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + python3 -m pip install --break-system-packages --upgrade pip + python3 -m pip install --break-system-packages wheel twine setuptools + + twine upload ./wheelhouse/*.whl diff --git a/.github/workflows/build-wheels-macos-x64.yaml b/.github/workflows/build-wheels-macos-x64.yaml index 13dc292dc..fbd7781b5 100644 --- a/.github/workflows/build-wheels-macos-x64.yaml +++ b/.github/workflows/build-wheels-macos-x64.yaml @@ -99,7 +99,7 @@ jobs: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | - python3 -m pip install --upgrade pip - python3 -m pip install wheel twine setuptools + python3 -m pip install --break-system-packages --upgrade pip + python3 -m pip install --break-system-packages wheel twine setuptools twine upload ./wheelhouse/*.whl diff --git a/.github/workflows/export-telespeech-ctc.yaml b/.github/workflows/export-telespeech-ctc.yaml index 38b3ce68f..102c3884e 100644 --- a/.github/workflows/export-telespeech-ctc.yaml +++ b/.github/workflows/export-telespeech-ctc.yaml @@ -48,3 +48,49 @@ jobs: repo_name: k2-fsa/sherpa-onnx repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} tag: asr-models + + - name: Publish float32 model to huggingface + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + src=scripts/tele-speech/sherpa-onnx-telespeech-ctc-zh-2024-06-04 + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + export GIT_CLONE_PROTECTION_ACTIVE=false + + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-zh-2024-06-04 hf + cp -a $src/* hf/ + cd hf + git lfs track "*.pdf" + git lfs track "*.onnx" + git add . 
+ git commit -m 'add model files' || true + git status + ls -lh + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-zh-2024-06-04 main || true + rm -rf hf + + - name: Publish int8 model to huggingface + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + src=scripts/tele-speech/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + export GIT_CLONE_PROTECTION_ACTIVE=false + + rm -rf hf + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 hf + cp -a $src/* hf/ + cd hf + git lfs track "*.pdf" + git lfs track "*.onnx" + git add . + git commit -m 'add model files' || true + git status + ls -lh + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 main || true diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 92f2b02f0..861a2df0e 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -130,34 +130,34 @@ jobs: name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }} path: install/* - - name: Test online transducer + - name: Test offline CTC shell: bash run: | du -h -d1 . export PATH=$PWD/build/bin:$PATH - export EXE=sherpa-onnx + export EXE=sherpa-onnx-offline - .github/scripts/test-online-transducer.sh + .github/scripts/test-offline-ctc.sh du -h -d1 . - - name: Test online transducer (C API) + - name: Test online transducer shell: bash run: | du -h -d1 . export PATH=$PWD/build/bin:$PATH - export EXE=decode-file-c-api + export EXE=sherpa-onnx .github/scripts/test-online-transducer.sh du -h -d1 . - - name: Test offline CTC + - name: Test online transducer (C API) shell: bash run: | du -h -d1 . 
export PATH=$PWD/build/bin:$PATH - export EXE=sherpa-onnx-offline + export EXE=decode-file-c-api - .github/scripts/test-offline-ctc.sh + .github/scripts/test-online-transducer.sh du -h -d1 . - name: Test spoken language identification (C++ API) diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml index 95030e573..1c7fe2ee1 100644 --- a/.github/workflows/macos.yaml +++ b/.github/workflows/macos.yaml @@ -107,6 +107,14 @@ jobs: otool -L build/bin/sherpa-onnx otool -l build/bin/sherpa-onnx + - name: Test offline CTC + shell: bash + run: | + export PATH=$PWD/build/bin:$PATH + export EXE=sherpa-onnx-offline + + .github/scripts/test-offline-ctc.sh + - name: Test offline transducer shell: bash run: | @@ -192,13 +200,7 @@ jobs: .github/scripts/test-offline-whisper.sh - - name: Test offline CTC - shell: bash - run: | - export PATH=$PWD/build/bin:$PATH - export EXE=sherpa-onnx-offline - .github/scripts/test-offline-ctc.sh - name: Test online transducer shell: bash diff --git a/.github/workflows/swift.yaml b/.github/workflows/swift.yaml index 5d3a01252..6b4ef10df 100644 --- a/.github/workflows/swift.yaml +++ b/.github/workflows/swift.yaml @@ -39,7 +39,7 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-13] + os: [macos-latest, macos-14] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index fa88d794a..5724d9cb9 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -30,14 +30,12 @@ concurrency: jobs: test-go: - name: ${{ matrix.os }} ${{matrix.arch }} + name: ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - include: - - os: macos-latest - arch: amd64 + os: [macos-latest, macos-14] steps: - uses: actions/checkout@v4 @@ -47,7 +45,7 @@ jobs: - name: ccache uses: hendrikmuhs/ccache-action@v1.2 with: - key: ${{ matrix.os }}-${{ matrix.arch }} + key: ${{ matrix.os }}-go - uses: actions/setup-go@v5 with: @@ -109,8 +107,6 @@ jobs: go 
build ls -lh - git lfs install - echo "Test vits-ljs" ./run-vits-ljs.sh rm -rf vits-ljs @@ -144,7 +140,13 @@ jobs: go build ls -lh - git lfs install + echo "Test telespeech ctc" + ./run-telespeech-ctc.sh + rm -rf sherpa-onnx-telespeech-ctc-* + + echo "Test transducer" + ./run-transducer.sh + rm -rf sherpa-onnx-zipformer-en-2023-06-26 echo "Test transducer" ./run-transducer.sh diff --git a/.github/workflows/test-piper-phonemize.yaml b/.github/workflows/test-piper-phonemize.yaml index dd00a4901..1edbae6d2 100644 --- a/.github/workflows/test-piper-phonemize.yaml +++ b/.github/workflows/test-piper-phonemize.yaml @@ -57,7 +57,7 @@ jobs: mkdir build cd build - cmake -DCMAKE_VERBOSE_MAKEFILE=ON -D SHERPA_ONNX_ENABLE_TESTS=ON -D CMAKE_BUILD_TYPE=${{ matrix.build_type }} -D BUILD_SHARED_LIBS=${{ matrix.shared_lib }} -DCMAKE_INSTALL_PREFIX=./install .. + cmake -DSHERPA_ONNX_ENABLE_EPSEAK_NG_EXE=ON -DBUILD_ESPEAK_NG_EXE=ON -DCMAKE_VERBOSE_MAKEFILE=ON -D SHERPA_ONNX_ENABLE_TESTS=ON -D CMAKE_BUILD_TYPE=${{ matrix.build_type }} -D BUILD_SHARED_LIBS=${{ matrix.shared_lib }} -DCMAKE_INSTALL_PREFIX=./install .. 
- name: Build shell: bash diff --git a/.gitignore b/.gitignore index 282da268f..1eb26e5c2 100644 --- a/.gitignore +++ b/.gitignore @@ -106,3 +106,4 @@ node_modules package-lock.json sherpa-onnx-nemo-* sherpa-onnx-vits-* +sherpa-onnx-telespeech-ctc-* diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b5136510..584583ba1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "Minimum OS X deployment ve project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.9.26") +set(SHERPA_ONNX_VERSION "1.9.27") # Disable warning about # diff --git a/cmake/espeak-ng-for-piper.cmake b/cmake/espeak-ng-for-piper.cmake index 42b6ce75f..8601ab2d4 100644 --- a/cmake/espeak-ng-for-piper.cmake +++ b/cmake/espeak-ng-for-piper.cmake @@ -14,7 +14,9 @@ function(download_espeak_ng_for_piper) set(USE_SPEECHPLAYER OFF CACHE BOOL "" FORCE) set(EXTRA_cmn ON CACHE BOOL "" FORCE) set(EXTRA_ru ON CACHE BOOL "" FORCE) - set(BUILD_ESPEAK_NG_EXE OFF CACHE BOOL "" FORCE) + if (NOT SHERPA_ONNX_ENABLE_EPSEAK_NG_EXE) + set(BUILD_ESPEAK_NG_EXE OFF CACHE BOOL "" FORCE) + endif() # If you don't have access to the Internet, # please pre-download kaldi-decoder diff --git a/cmake/kaldi-native-fbank.cmake b/cmake/kaldi-native-fbank.cmake index ce76745ed..1d10a01e6 100644 --- a/cmake/kaldi-native-fbank.cmake +++ b/cmake/kaldi-native-fbank.cmake @@ -1,9 +1,9 @@ function(download_kaldi_native_fbank) include(FetchContent) - set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.1.tar.gz") - set(kaldi_native_fbank_URL2 "https://hub.nuaa.cf/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.1.tar.gz") - set(kaldi_native_fbank_HASH "SHA256=0cae8cbb9ea42916b214e088912f9e8f2f648f54756b305f93f552382f31f904") + set(kaldi_native_fbank_URL "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.3.tar.gz") + set(kaldi_native_fbank_URL2 
"https://hub.nuaa.cf/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.19.3.tar.gz") + set(kaldi_native_fbank_HASH "SHA256=335fe1daf1b9bfb2a7b6bf03b64c4c4686c39077c57fb8058c02611981676638") set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE) set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE) @@ -12,11 +12,11 @@ function(download_kaldi_native_fbank) # If you don't have access to the Internet, # please pre-download kaldi-native-fbank set(possible_file_locations - $ENV{HOME}/Downloads/kaldi-native-fbank-1.19.1.tar.gz - ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.19.1.tar.gz - ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.19.1.tar.gz - /tmp/kaldi-native-fbank-1.19.1.tar.gz - /star-fj/fangjun/download/github/kaldi-native-fbank-1.19.1.tar.gz + $ENV{HOME}/Downloads/kaldi-native-fbank-1.19.3.tar.gz + ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.19.3.tar.gz + ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.19.3.tar.gz + /tmp/kaldi-native-fbank-1.19.3.tar.gz + /star-fj/fangjun/download/github/kaldi-native-fbank-1.19.3.tar.gz ) foreach(f IN LISTS possible_file_locations) diff --git a/dotnet-examples/offline-decode-files/Program.cs b/dotnet-examples/offline-decode-files/Program.cs index f88c634d7..ea30a14e2 100644 --- a/dotnet-examples/offline-decode-files/Program.cs +++ b/dotnet-examples/offline-decode-files/Program.cs @@ -34,6 +34,9 @@ class Options [Option(Required = false, Default = "",HelpText = "Path to transducer joiner.onnx. Used only for transducer models")] public string Joiner { get; set; } + [Option("model-type", Required = false, Default = "", HelpText = "model type")] + public string ModelType { get; set; } + [Option("whisper-encoder", Required = false, Default = "", HelpText = "Path to whisper encoder.onnx. Used only for whisper models")] public string WhisperEncoder { get; set; } @@ -56,6 +59,9 @@ class Options [Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. 
Used only for NeMo CTC models")] public string NeMoCtc { get; set; } + [Option("telespeech-ctc", Required = false, HelpText = "Path to model.onnx. Used only for TeleSpeech CTC models")] + public string TeleSpeechCtc { get; set; } + [Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")] public int NumThreads { get; set; } @@ -201,6 +207,10 @@ private static void Run(Options options) { config.ModelConfig.NeMoCtc.Model = options.NeMoCtc; } + else if (!String.IsNullOrEmpty(options.TeleSpeechCtc)) + { + config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc; + } else if (!String.IsNullOrEmpty(options.WhisperEncoder)) { config.ModelConfig.Whisper.Encoder = options.WhisperEncoder; @@ -218,6 +228,7 @@ private static void Run(Options options) return; } + config.ModelConfig.ModelType = options.ModelType; config.DecodingMethod = options.DecodingMethod; config.MaxActivePaths = options.MaxActivePaths; config.HotwordsFile = options.HotwordsFile; diff --git a/dotnet-examples/offline-decode-files/run-telespeech-ctc.sh b/dotnet-examples/offline-decode-files/run-telespeech-ctc.sh new file mode 100755 index 000000000..d678026d0 --- /dev/null +++ b/dotnet-examples/offline-decode-files/run-telespeech-ctc.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! 
-d sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 +fi + +dotnet run \ + --telespeech-ctc=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx \ + --tokens=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt \ + --model-type=telespeech-ctc \ + --files ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav diff --git a/go-api-examples/non-streaming-decode-files/main.go b/go-api-examples/non-streaming-decode-files/main.go index d38c43c26..48ad35302 100644 --- a/go-api-examples/non-streaming-decode-files/main.go +++ b/go-api-examples/non-streaming-decode-files/main.go @@ -40,6 +40,9 @@ func main() { flag.IntVar(&config.ModelConfig.Debug, "debug", 0, "Whether to show debug message") flag.StringVar(&config.ModelConfig.ModelType, "model-type", "", "Optional. Used for loading the model in a faster way") flag.StringVar(&config.ModelConfig.Provider, "provider", "cpu", "Provider to use") + flag.StringVar(&config.ModelConfig.ModelingUnit, "modeling-unit", "cjkchar", "cjkchar, bpe, cjkchar+bpe, or leave it to empty") + flag.StringVar(&config.ModelConfig.BpeVocab, "bpe-vocab", "", "") + flag.StringVar(&config.ModelConfig.TeleSpeechCtc, "telespeech-ctc", "", "Used for TeleSpeechCtc model") flag.StringVar(&config.LmConfig.Model, "lm-model", "", "Optional. Path to the LM model") flag.Float32Var(&config.LmConfig.Scale, "lm-scale", 1.0, "Optional. 
Scale for the LM model") diff --git a/go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh b/go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh new file mode 100755 index 000000000..d9785b2aa --- /dev/null +++ b/go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -d sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04 ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 +fi + +go mod tidy +go build + +./non-streaming-decode-files \ + --telespeech-ctc ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx \ + --tokens ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt \ + --model-type telespeech-ctc \ + --debug 0 \ + ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav diff --git a/java-api-examples/NonStreamingDecodeFileParaformer.java b/java-api-examples/NonStreamingDecodeFileParaformer.java index 4a49c8d66..81121f6a1 100644 --- a/java-api-examples/NonStreamingDecodeFileParaformer.java +++ b/java-api-examples/NonStreamingDecodeFileParaformer.java @@ -4,7 +4,7 @@ // to decode files. 
import com.k2fsa.sherpa.onnx.*; -public class NonStreamingDecodeFileTransducer { +public class NonStreamingDecodeFileParaformer { public static void main(String[] args) { // please refer to // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english // to download model files diff --git a/java-api-examples/NonStreamingDecodeFileTeleSpeechCtc.java b/java-api-examples/NonStreamingDecodeFileTeleSpeechCtc.java new file mode 100644 index 000000000..ccb2bde04 --- /dev/null +++ b/java-api-examples/NonStreamingDecodeFileTeleSpeechCtc.java @@ -0,0 +1,47 @@ +// Copyright 2024 Xiaomi Corporation + +// This file shows how to use an offline TeleSpeech CTC model +// to decode files. +import com.k2fsa.sherpa.onnx.*; + +public class NonStreamingDecodeFileTeleSpeechCtc { + public static void main(String[] args) { + // please refer to + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + // to download model files + String model = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx"; + String tokens = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt"; + + String waveFilename = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav"; + + WaveReader reader = new WaveReader(waveFilename); + + OfflineModelConfig modelConfig = + OfflineModelConfig.builder() + .setTeleSpeech(model) + .setTokens(tokens) + .setNumThreads(1) + .setDebug(true) + .setModelType("telespeech_ctc") + .build(); + + OfflineRecognizerConfig config = + OfflineRecognizerConfig.builder() + .setOfflineModelConfig(modelConfig) + .setDecodingMethod("greedy_search") + .build(); + + OfflineRecognizer recognizer = new OfflineRecognizer(config); + OfflineStream stream = recognizer.createStream(); + stream.acceptWaveform(reader.getSamples(), reader.getSampleRate()); + + recognizer.decode(stream); + 
+ String text = recognizer.getResult(stream).getText(); + + System.out.printf("filename:%s\nresult:%s\n", waveFilename, text); + + stream.release(); + recognizer.release(); + } +} diff --git a/java-api-examples/run-non-streaming-decode-file-tele-speech-ctc.sh b/java-api-examples/run-non-streaming-decode-file-tele-speech-ctc.sh new file mode 100755 index 000000000..075f1e2e3 --- /dev/null +++ b/java-api-examples/run-non-streaming-decode-file-tele-speech-ctc.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +set -ex + +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then + mkdir -p ../build + pushd ../build + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=ON \ + .. + + make -j4 + ls -lh lib + popd +fi + +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then + pushd ../sherpa-onnx/java-api + make + popd +fi + +if [ ! -f ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 +fi + +java \ + -Djava.library.path=$PWD/../build/lib \ + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ + ./NonStreamingDecodeFileTeleSpeechCtc.java diff --git a/python-api-examples/offline-telespeech-ctc-decode-files.py b/python-api-examples/offline-telespeech-ctc-decode-files.py new file mode 100755 index 000000000..17b7f4853 --- /dev/null +++ b/python-api-examples/offline-telespeech-ctc-decode-files.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +""" +This file shows how to use a non-streaming CTC model from +https://github.com/Tele-AI/TeleSpeech-ASR +to decode files. 
+ +Please download model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + + +""" + +from pathlib import Path + +import sherpa_onnx +import soundfile as sf + + +def create_recognizer(): + model = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx" + tokens = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt" + test_wav = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav" + # test_wav = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/4-tianjin.wav" + # test_wav = "./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/5-henan.wav" + + if not Path(model).is_file() or not Path(test_wav).is_file(): + raise ValueError( + """Please download model files from + https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + """ + ) + return ( + sherpa_onnx.OfflineRecognizer.from_telespeech_ctc( + model=model, + tokens=tokens, + debug=True, + ), + test_wav, + ) + + +def main(): + recognizer, wave_filename = create_recognizer() + + audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True) + audio = audio[:, 0] # only use the first channel + + # audio is a 1-D float32 numpy array normalized to the range [-1, 1] + # sample_rate does not need to be 16000 Hz + + stream = recognizer.create_stream() + stream.accept_waveform(sample_rate, audio) + recognizer.decode_stream(stream) + print(wave_filename) + print(stream.result) + + +if __name__ == "__main__": + main() diff --git a/scripts/apk/generate-vad-asr-apk-script.py b/scripts/apk/generate-vad-asr-apk-script.py index 009c565a7..ca38fa3fb 100755 --- a/scripts/apk/generate-vad-asr-apk-script.py +++ b/scripts/apk/generate-vad-asr-apk-script.py @@ -163,6 +163,22 @@ def get_models(): ls -lh + popd + """, + ), + Model( + model_name="sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04", + idx=11, + lang="zh", + short_name="telespeech", + cmd=""" + pushd $model_name + + rm -rfv test_wavs + rm test.py + + ls -lh + popd """, ), diff --git 
a/scripts/dotnet/OfflineModelConfig.cs b/scripts/dotnet/OfflineModelConfig.cs index 2dc2347c1..f5620944f 100644 --- a/scripts/dotnet/OfflineModelConfig.cs +++ b/scripts/dotnet/OfflineModelConfig.cs @@ -25,6 +25,7 @@ public OfflineModelConfig() ModelType = ""; ModelingUnit = "cjkchar"; BpeVocab = ""; + TeleSpeechCtc = ""; } public OfflineTransducerModelConfig Transducer; public OfflineParaformerModelConfig Paraformer; @@ -50,5 +51,8 @@ public OfflineModelConfig() [MarshalAs(UnmanagedType.LPStr)] public string BpeVocab; + + [MarshalAs(UnmanagedType.LPStr)] + public string TeleSpeechCtc; } } diff --git a/scripts/dotnet/run.sh b/scripts/dotnet/run.sh index 3ce1a0fb5..7aa3ae5e4 100755 --- a/scripts/dotnet/run.sh +++ b/scripts/dotnet/run.sh @@ -30,7 +30,7 @@ mkdir -p linux macos windows-x64 windows-x86 linux_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl linux_wheel=$src_dir/$linux_wheel_filename -macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_x86_64.whl +macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_universal2.whl macos_wheel=$src_dir/$macos_wheel_filename windows_x64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl @@ -61,7 +61,7 @@ if [ ! -f $src_dir/linux/libsherpa-onnx-core.so ]; then fi if [ ! 
-f $src_dir/macos/libsherpa-onnx-core.dylib ]; then - echo "---macOS x86_64---" + echo "--- macOS x86_64/arm64 universal2---" cd macos mkdir -p wheel cd wheel diff --git a/scripts/go/_internal/non-streaming-decode-files/run-telespeech-ctc.sh b/scripts/go/_internal/non-streaming-decode-files/run-telespeech-ctc.sh new file mode 120000 index 000000000..1625a0708 --- /dev/null +++ b/scripts/go/_internal/non-streaming-decode-files/run-telespeech-ctc.sh @@ -0,0 +1 @@ +../../../../go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh \ No newline at end of file diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index e89787da9..a03031866 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -381,8 +381,9 @@ type OfflineModelConfig struct { // Optional. Specify it for faster model initialization. ModelType string - ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe - BpeVocab string // Optional. + ModelingUnit string // Optional. cjkchar, bpe, cjkchar+bpe + BpeVocab string // Optional. + TeleSpeechCtc string // Optional. } // Configuration for the offline/non-streaming recognizer. 
@@ -477,6 +478,9 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer { c.model_config.bpe_vocab = C.CString(config.ModelConfig.BpeVocab) defer C.free(unsafe.Pointer(c.model_config.bpe_vocab)) + c.model_config.telespeech_ctc = C.CString(config.ModelConfig.TeleSpeechCtc) + defer C.free(unsafe.Pointer(c.model_config.telespeech_ctc)) + c.lm_config.model = C.CString(config.LmConfig.Model) defer C.free(unsafe.Pointer(c.lm_config.model)) diff --git a/scripts/node-addon-api/src/non-streaming-asr.cc b/scripts/node-addon-api/src/non-streaming-asr.cc index d101c7eb6..671528200 100644 --- a/scripts/node-addon-api/src/non-streaming-asr.cc +++ b/scripts/node-addon-api/src/non-streaming-asr.cc @@ -128,6 +128,7 @@ static SherpaOnnxOfflineModelConfig GetOfflineModelConfig(Napi::Object obj) { SHERPA_ONNX_ASSIGN_ATTR_STR(model_type, modelType); SHERPA_ONNX_ASSIGN_ATTR_STR(modeling_unit, modelingUnit); SHERPA_ONNX_ASSIGN_ATTR_STR(bpe_vocab, bpeVocab); + SHERPA_ONNX_ASSIGN_ATTR_STR(telespeech_ctc, teleSpeechCtc); return c; } @@ -242,6 +243,10 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { delete[] c.model_config.bpe_vocab; } + if (c.model_config.telespeech_ctc) { + delete[] c.model_config.telespeech_ctc; + } + if (c.lm_config.model) { delete[] c.lm_config.model; } diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index eaf782b92..eb4e293d1 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -366,6 +366,9 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( recognizer_config.model_config.bpe_vocab = SHERPA_ONNX_OR(config->model_config.bpe_vocab, ""); + recognizer_config.model_config.telespeech_ctc = + SHERPA_ONNX_OR(config->model_config.telespeech_ctc, ""); + recognizer_config.lm_config.model = SHERPA_ONNX_OR(config->lm_config.model, ""); recognizer_config.lm_config.scale = diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index bd9b6a4d4..e75d1955f 100644 --- 
a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -395,6 +395,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig { // - cjkchar+bpe const char *modeling_unit; const char *bpe_vocab; + const char *telespeech_ctc; } SherpaOnnxOfflineModelConfig; SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index 232412338..6edb82402 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -39,6 +39,7 @@ set(sources offline-stream.cc offline-tdnn-ctc-model.cc offline-tdnn-model-config.cc + offline-telespeech-ctc-model.cc offline-transducer-greedy-search-decoder.cc offline-transducer-greedy-search-nemo-decoder.cc offline-transducer-model-config.cc diff --git a/sherpa-onnx/csrc/features.cc b/sherpa-onnx/csrc/features.cc index 2eab92ede..ed806f392 100644 --- a/sherpa-onnx/csrc/features.cc +++ b/sherpa-onnx/csrc/features.cc @@ -56,22 +56,11 @@ std::string FeatureExtractorConfig::ToString() const { class FeatureExtractor::Impl { public: explicit Impl(const FeatureExtractorConfig &config) : config_(config) { - opts_.frame_opts.dither = config.dither; - opts_.frame_opts.snip_edges = config.snip_edges; - opts_.frame_opts.samp_freq = config.sampling_rate; - opts_.frame_opts.frame_shift_ms = config.frame_shift_ms; - opts_.frame_opts.frame_length_ms = config.frame_length_ms; - opts_.frame_opts.remove_dc_offset = config.remove_dc_offset; - opts_.frame_opts.window_type = config.window_type; - - opts_.mel_opts.num_bins = config.feature_dim; - - opts_.mel_opts.high_freq = config.high_freq; - opts_.mel_opts.low_freq = config.low_freq; - - opts_.mel_opts.is_librosa = config.is_librosa; - - fbank_ = std::make_unique(opts_); + if (config_.is_mfcc) { + InitMfcc(); + } else { + InitFbank(); + } } void AcceptWaveform(int32_t sampling_rate, const float *waveform, int32_t n) { @@ -101,35 +90,48 @@ class FeatureExtractor::Impl { std::vector samples; 
resampler_->Resample(waveform, n, false, &samples); - fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, samples.data(), - samples.size()); + if (fbank_) { + fbank_->AcceptWaveform(config_.sampling_rate, samples.data(), + samples.size()); + } else { + mfcc_->AcceptWaveform(config_.sampling_rate, samples.data(), + samples.size()); + } return; } - if (sampling_rate != opts_.frame_opts.samp_freq) { + if (sampling_rate != config_.sampling_rate) { SHERPA_ONNX_LOGE( "Creating a resampler:\n" " in_sample_rate: %d\n" " output_sample_rate: %d\n", - sampling_rate, static_cast(opts_.frame_opts.samp_freq)); + sampling_rate, static_cast(config_.sampling_rate)); - float min_freq = - std::min(sampling_rate, opts_.frame_opts.samp_freq); + float min_freq = std::min(sampling_rate, config_.sampling_rate); float lowpass_cutoff = 0.99 * 0.5 * min_freq; int32_t lowpass_filter_width = 6; resampler_ = std::make_unique( - sampling_rate, opts_.frame_opts.samp_freq, lowpass_cutoff, + sampling_rate, config_.sampling_rate, lowpass_cutoff, lowpass_filter_width); std::vector samples; resampler_->Resample(waveform, n, false, &samples); - fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, samples.data(), - samples.size()); + if (fbank_) { + fbank_->AcceptWaveform(config_.sampling_rate, samples.data(), + samples.size()); + } else { + mfcc_->AcceptWaveform(config_.sampling_rate, samples.data(), + samples.size()); + } return; } - fbank_->AcceptWaveform(sampling_rate, waveform, n); + if (fbank_) { + fbank_->AcceptWaveform(sampling_rate, waveform, n); + } else { + mfcc_->AcceptWaveform(sampling_rate, waveform, n); + } } void InputFinished() const { @@ -179,11 +181,56 @@ class FeatureExtractor::Impl { return features; } - int32_t FeatureDim() const { return opts_.mel_opts.num_bins; } + int32_t FeatureDim() const { + return mfcc_ ? 
mfcc_opts_.num_ceps : opts_.mel_opts.num_bins; + } + + private: + void InitFbank() { + opts_.frame_opts.dither = config_.dither; + opts_.frame_opts.snip_edges = config_.snip_edges; + opts_.frame_opts.samp_freq = config_.sampling_rate; + opts_.frame_opts.frame_shift_ms = config_.frame_shift_ms; + opts_.frame_opts.frame_length_ms = config_.frame_length_ms; + opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; + opts_.frame_opts.window_type = config_.window_type; + + opts_.mel_opts.num_bins = config_.feature_dim; + + opts_.mel_opts.high_freq = config_.high_freq; + opts_.mel_opts.low_freq = config_.low_freq; + + opts_.mel_opts.is_librosa = config_.is_librosa; + + fbank_ = std::make_unique(opts_); + } + void InitMfcc() { + mfcc_opts_.frame_opts.dither = config_.dither; + mfcc_opts_.frame_opts.snip_edges = config_.snip_edges; + mfcc_opts_.frame_opts.samp_freq = config_.sampling_rate; + mfcc_opts_.frame_opts.frame_shift_ms = config_.frame_shift_ms; + mfcc_opts_.frame_opts.frame_length_ms = config_.frame_length_ms; + mfcc_opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; + mfcc_opts_.frame_opts.window_type = config_.window_type; + + mfcc_opts_.mel_opts.num_bins = config_.feature_dim; + + mfcc_opts_.mel_opts.high_freq = config_.high_freq; + mfcc_opts_.mel_opts.low_freq = config_.low_freq; + + mfcc_opts_.mel_opts.is_librosa = config_.is_librosa; + + mfcc_opts_.num_ceps = config_.num_ceps; + mfcc_opts_.use_energy = config_.use_energy; + + mfcc_ = std::make_unique(mfcc_opts_); + } private: std::unique_ptr fbank_; + std::unique_ptr mfcc_; knf::FbankOptions opts_; + knf::MfccOptions mfcc_opts_; FeatureExtractorConfig config_; mutable std::mutex mutex_; std::unique_ptr resampler_; diff --git a/sherpa-onnx/csrc/features.h b/sherpa-onnx/csrc/features.h index c3bc02d5e..2e4596a57 100644 --- a/sherpa-onnx/csrc/features.h +++ b/sherpa-onnx/csrc/features.h @@ -18,7 +18,10 @@ struct FeatureExtractorConfig { // the sampling rate of the input waveform, we will do 
resampling inside. int32_t sampling_rate = 16000; - // Feature dimension + // num_mel_bins + // + // Note: for mfcc, this value is also for num_mel_bins. + // The actual feature dimension is actually num_ceps int32_t feature_dim = 80; // minimal frequency for Mel-filterbank, in Hz @@ -69,6 +72,12 @@ struct FeatureExtractorConfig { // for details std::string nemo_normalize_type; + // for MFCC + int32_t num_ceps = 13; + bool use_energy = true; + + bool is_mfcc = false; + std::string ToString() const; void Register(ParseOptions *po); diff --git a/sherpa-onnx/csrc/offline-ctc-model.cc b/sherpa-onnx/csrc/offline-ctc-model.cc index ed18720a3..cfa8ab45c 100644 --- a/sherpa-onnx/csrc/offline-ctc-model.cc +++ b/sherpa-onnx/csrc/offline-ctc-model.cc @@ -12,6 +12,7 @@ #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/offline-nemo-enc-dec-ctc-model.h" #include "sherpa-onnx/csrc/offline-tdnn-ctc-model.h" +#include "sherpa-onnx/csrc/offline-telespeech-ctc-model.h" #include "sherpa-onnx/csrc/offline-wenet-ctc-model.h" #include "sherpa-onnx/csrc/offline-zipformer-ctc-model.h" #include "sherpa-onnx/csrc/onnx-utils.h" @@ -24,6 +25,7 @@ enum class ModelType { kTdnn, kZipformerCtc, kWenetCtc, + kTeleSpeechCtc, kUnknown, }; @@ -63,6 +65,9 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, "If you are using models from WeNet, please refer to\n" "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/" "run.sh\n" + "If you are using models from TeleSpeech, please refer to\n" + "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/tele-speech/" + "add-metadata.py" "\n" "for how to add metadta to model.onnx\n"); return ModelType::kUnknown; @@ -78,6 +83,8 @@ static ModelType GetModelType(char *model_data, size_t model_data_length, return ModelType::kZipformerCtc; } else if (model_type.get() == std::string("wenet_ctc")) { return ModelType::kWenetCtc; } else if (model_type.get() == std::string("telespeech_ctc")) { return 
ModelType::kTeleSpeechCtc; } else { SHERPA_ONNX_LOGE("Unsupported model_type: %s", model_type.get()); return ModelType::kUnknown; @@ -97,6 +104,8 @@ std::unique_ptr OfflineCtcModel::Create( filename = config.zipformer_ctc.model; } else if (!config.wenet_ctc.model.empty()) { filename = config.wenet_ctc.model; + } else if (!config.telespeech_ctc.empty()) { + filename = config.telespeech_ctc; } else { SHERPA_ONNX_LOGE("Please specify a CTC model"); exit(-1); @@ -124,6 +133,9 @@ std::unique_ptr OfflineCtcModel::Create( case ModelType::kWenetCtc: return std::make_unique(config); break; + case ModelType::kTeleSpeechCtc: + return std::make_unique(config); + break; case ModelType::kUnknown: SHERPA_ONNX_LOGE("Unknown model type in offline CTC!"); return nullptr; @@ -147,6 +159,8 @@ std::unique_ptr OfflineCtcModel::Create( filename = config.zipformer_ctc.model; } else if (!config.wenet_ctc.model.empty()) { filename = config.wenet_ctc.model; + } else if (!config.telespeech_ctc.empty()) { + filename = config.telespeech_ctc; } else { SHERPA_ONNX_LOGE("Please specify a CTC model"); exit(-1); @@ -175,6 +189,9 @@ std::unique_ptr OfflineCtcModel::Create( case ModelType::kWenetCtc: return std::make_unique(mgr, config); break; + case ModelType::kTeleSpeechCtc: + return std::make_unique(mgr, config); + break; case ModelType::kUnknown: SHERPA_ONNX_LOGE("Unknown model type in offline CTC!"); return nullptr; diff --git a/sherpa-onnx/csrc/offline-model-config.cc b/sherpa-onnx/csrc/offline-model-config.cc index b85a0a9f2..24a5a2141 100644 --- a/sherpa-onnx/csrc/offline-model-config.cc +++ b/sherpa-onnx/csrc/offline-model-config.cc @@ -19,6 +19,9 @@ void OfflineModelConfig::Register(ParseOptions *po) { zipformer_ctc.Register(po); wenet_ctc.Register(po); + po->Register("telespeech-ctc", &telespeech_ctc, + "Path to model.onnx for telespeech ctc"); + po->Register("tokens", &tokens, "Path to tokens.txt"); po->Register("num-threads", &num_threads, @@ -33,7 +36,7 @@ void 
OfflineModelConfig::Register(ParseOptions *po) { po->Register("model-type", &model_type, "Specify it to reduce model initialization time. " "Valid values are: transducer, paraformer, nemo_ctc, whisper, " - "tdnn, zipformer2_ctc" + "tdnn, zipformer2_ctc, telespeech_ctc." "All other values lead to loading the model twice."); po->Register("modeling-unit", &modeling_unit, "The modeling unit of the model, commonly used units are bpe, " @@ -55,14 +58,14 @@ bool OfflineModelConfig::Validate() const { } if (!FileExists(tokens)) { - SHERPA_ONNX_LOGE("tokens: %s does not exist", tokens.c_str()); + SHERPA_ONNX_LOGE("tokens: '%s' does not exist", tokens.c_str()); return false; } if (!modeling_unit.empty() && (modeling_unit == "bpe" || modeling_unit == "cjkchar+bpe")) { if (!FileExists(bpe_vocab)) { - SHERPA_ONNX_LOGE("bpe_vocab: %s does not exist", bpe_vocab.c_str()); + SHERPA_ONNX_LOGE("bpe_vocab: '%s' does not exist", bpe_vocab.c_str()); return false; } } @@ -91,6 +94,14 @@ bool OfflineModelConfig::Validate() const { return wenet_ctc.Validate(); } + if (!telespeech_ctc.empty() && !FileExists(telespeech_ctc)) { + SHERPA_ONNX_LOGE("telespeech_ctc: '%s' does not exist", + telespeech_ctc.c_str()); + return false; + } else { + return true; + } + return transducer.Validate(); } @@ -105,6 +116,7 @@ std::string OfflineModelConfig::ToString() const { os << "tdnn=" << tdnn.ToString() << ", "; os << "zipformer_ctc=" << zipformer_ctc.ToString() << ", "; os << "wenet_ctc=" << wenet_ctc.ToString() << ", "; + os << "telespeech_ctc=\"" << telespeech_ctc << "\", "; os << "tokens=\"" << tokens << "\", "; os << "num_threads=" << num_threads << ", "; os << "debug=" << (debug ? 
"True" : "False") << ", "; diff --git a/sherpa-onnx/csrc/offline-model-config.h b/sherpa-onnx/csrc/offline-model-config.h index 93ea7fd0e..856a6f35d 100644 --- a/sherpa-onnx/csrc/offline-model-config.h +++ b/sherpa-onnx/csrc/offline-model-config.h @@ -24,6 +24,7 @@ struct OfflineModelConfig { OfflineTdnnModelConfig tdnn; OfflineZipformerCtcModelConfig zipformer_ctc; OfflineWenetCtcModelConfig wenet_ctc; + std::string telespeech_ctc; std::string tokens; int32_t num_threads = 2; @@ -52,6 +53,7 @@ struct OfflineModelConfig { const OfflineTdnnModelConfig &tdnn, const OfflineZipformerCtcModelConfig &zipformer_ctc, const OfflineWenetCtcModelConfig &wenet_ctc, + const std::string &telespeech_ctc, const std::string &tokens, int32_t num_threads, bool debug, const std::string &provider, const std::string &model_type, const std::string &modeling_unit, @@ -63,6 +65,7 @@ struct OfflineModelConfig { tdnn(tdnn), zipformer_ctc(zipformer_ctc), wenet_ctc(wenet_ctc), + telespeech_ctc(telespeech_ctc), tokens(tokens), num_threads(num_threads), debug(debug), diff --git a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h index 988a487b6..2c83dac28 100644 --- a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h @@ -88,6 +88,17 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { #endif void Init() { + if (!config_.model_config.telespeech_ctc.empty()) { + config_.feat_config.snip_edges = true; + config_.feat_config.num_ceps = 40; + config_.feat_config.feature_dim = 40; + config_.feat_config.low_freq = 40; + config_.feat_config.high_freq = -200; + config_.feat_config.use_energy = false; + config_.feat_config.normalize_samples = false; + config_.feat_config.is_mfcc = true; + } + if (!config_.model_config.wenet_ctc.model.empty()) { // WeNet CTC models assume input samples are in the range // [-32768, 32767], so we set normalize_samples to false diff --git 
a/sherpa-onnx/csrc/offline-recognizer-impl.cc b/sherpa-onnx/csrc/offline-recognizer-impl.cc index c23acf126..656425778 100644 --- a/sherpa-onnx/csrc/offline-recognizer-impl.cc +++ b/sherpa-onnx/csrc/offline-recognizer-impl.cc @@ -29,7 +29,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( } else if (model_type == "paraformer") { return std::make_unique(config); } else if (model_type == "nemo_ctc" || model_type == "tdnn" || - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" || + model_type == "telespeech_ctc") { return std::make_unique(config); } else if (model_type == "whisper") { return std::make_unique(config); @@ -53,6 +54,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( model_filename = config.model_config.paraformer.model; } else if (!config.model_config.nemo_ctc.model.empty()) { model_filename = config.model_config.nemo_ctc.model; + } else if (!config.model_config.telespeech_ctc.empty()) { + model_filename = config.model_config.telespeech_ctc; } else if (!config.model_config.tdnn.model.empty()) { model_filename = config.model_config.tdnn.model; } else if (!config.model_config.zipformer_ctc.model.empty()) { @@ -111,6 +114,10 @@ std::unique_ptr OfflineRecognizerImpl::Create( "\n " "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/run.sh" "\n" + "(7) CTC models from TeleSpeech" + "\n " + "https://github.com/Tele-AI/TeleSpeech-ASR" + "\n" "\n"); exit(-1); } @@ -133,7 +140,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( if (model_type == "EncDecCTCModelBPE" || model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" || - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" || + model_type == "telespeech_ctc") { return std::make_unique(config); } @@ -151,7 +159,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( " - Whisper models\n" " - Tdnn models\n" " - Zipformer CTC models\n" - " - 
WeNet CTC models\n", + " - WeNet CTC models\n" + " - TeleSpeech CTC models\n", model_type.c_str()); exit(-1); @@ -169,7 +178,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( } else if (model_type == "paraformer") { return std::make_unique(mgr, config); } else if (model_type == "nemo_ctc" || model_type == "tdnn" || - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" || + model_type == "telespeech_ctc") { return std::make_unique(mgr, config); } else if (model_type == "whisper") { return std::make_unique(mgr, config); @@ -199,6 +209,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( model_filename = config.model_config.zipformer_ctc.model; } else if (!config.model_config.wenet_ctc.model.empty()) { model_filename = config.model_config.wenet_ctc.model; + } else if (!config.model_config.telespeech_ctc.empty()) { + model_filename = config.model_config.telespeech_ctc; } else if (!config.model_config.whisper.encoder.empty()) { model_filename = config.model_config.whisper.encoder; } else { @@ -251,6 +263,10 @@ std::unique_ptr OfflineRecognizerImpl::Create( "\n " "https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/wenet/run.sh" "\n" + "(7) CTC models from TeleSpeech" + "\n " + "https://github.com/Tele-AI/TeleSpeech-ASR" + "\n" "\n"); exit(-1); } @@ -273,7 +289,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( if (model_type == "EncDecCTCModelBPE" || model_type == "EncDecHybridRNNTCTCBPEModel" || model_type == "tdnn" || - model_type == "zipformer2_ctc" || model_type == "wenet_ctc") { + model_type == "zipformer2_ctc" || model_type == "wenet_ctc" || + model_type == "telespeech_ctc") { return std::make_unique(mgr, config); } @@ -291,7 +308,8 @@ std::unique_ptr OfflineRecognizerImpl::Create( " - Whisper models\n" " - Tdnn models\n" " - Zipformer CTC models\n" - " - WeNet CTC models\n", + " - WeNet CTC models\n" + " - TeleSpeech CTC models\n", model_type.c_str()); exit(-1); diff --git 
a/sherpa-onnx/csrc/offline-stream.cc b/sherpa-onnx/csrc/offline-stream.cc index 206b36003..4321a62dd 100644 --- a/sherpa-onnx/csrc/offline-stream.cc +++ b/sherpa-onnx/csrc/offline-stream.cc @@ -57,22 +57,44 @@ class OfflineStream::Impl { explicit Impl(const FeatureExtractorConfig &config, ContextGraphPtr context_graph) : config_(config), context_graph_(context_graph) { - opts_.frame_opts.dither = config.dither; - opts_.frame_opts.snip_edges = config.snip_edges; - opts_.frame_opts.samp_freq = config.sampling_rate; - opts_.frame_opts.frame_shift_ms = config.frame_shift_ms; - opts_.frame_opts.frame_length_ms = config.frame_length_ms; - opts_.frame_opts.remove_dc_offset = config.remove_dc_offset; - opts_.frame_opts.window_type = config.window_type; + if (config.is_mfcc) { + mfcc_opts_.frame_opts.dither = config_.dither; + mfcc_opts_.frame_opts.snip_edges = config_.snip_edges; + mfcc_opts_.frame_opts.samp_freq = config_.sampling_rate; + mfcc_opts_.frame_opts.frame_shift_ms = config_.frame_shift_ms; + mfcc_opts_.frame_opts.frame_length_ms = config_.frame_length_ms; + mfcc_opts_.frame_opts.remove_dc_offset = config_.remove_dc_offset; + mfcc_opts_.frame_opts.window_type = config_.window_type; - opts_.mel_opts.num_bins = config.feature_dim; + mfcc_opts_.mel_opts.num_bins = config_.feature_dim; - opts_.mel_opts.high_freq = config.high_freq; - opts_.mel_opts.low_freq = config.low_freq; + mfcc_opts_.mel_opts.high_freq = config_.high_freq; + mfcc_opts_.mel_opts.low_freq = config_.low_freq; - opts_.mel_opts.is_librosa = config.is_librosa; + mfcc_opts_.mel_opts.is_librosa = config_.is_librosa; - fbank_ = std::make_unique(opts_); + mfcc_opts_.num_ceps = config_.num_ceps; + mfcc_opts_.use_energy = config_.use_energy; + + mfcc_ = std::make_unique(mfcc_opts_); + } else { + opts_.frame_opts.dither = config.dither; + opts_.frame_opts.snip_edges = config.snip_edges; + opts_.frame_opts.samp_freq = config.sampling_rate; + opts_.frame_opts.frame_shift_ms = config.frame_shift_ms; + 
opts_.frame_opts.frame_length_ms = config.frame_length_ms; + opts_.frame_opts.remove_dc_offset = config.remove_dc_offset; + opts_.frame_opts.window_type = config.window_type; + + opts_.mel_opts.num_bins = config.feature_dim; + + opts_.mel_opts.high_freq = config.high_freq; + opts_.mel_opts.low_freq = config.low_freq; + + opts_.mel_opts.is_librosa = config.is_librosa; + + fbank_ = std::make_unique(opts_); + } } explicit Impl(WhisperTag /*tag*/) { @@ -81,6 +103,7 @@ class OfflineStream::Impl { opts_.mel_opts.num_bins = 80; // not used whisper_fbank_ = std::make_unique(opts_.frame_opts); + config_.sampling_rate = opts_.frame_opts.samp_freq; } explicit Impl(CEDTag /*tag*/) { @@ -98,6 +121,8 @@ class OfflineStream::Impl { opts_.mel_opts.num_bins = 64; opts_.mel_opts.high_freq = 8000; + config_.sampling_rate = opts_.frame_opts.samp_freq; + fbank_ = std::make_unique(opts_); } @@ -115,52 +140,60 @@ class OfflineStream::Impl { void AcceptWaveformImpl(int32_t sampling_rate, const float *waveform, int32_t n) { - if (sampling_rate != opts_.frame_opts.samp_freq) { + if (sampling_rate != config_.sampling_rate) { SHERPA_ONNX_LOGE( "Creating a resampler:\n" " in_sample_rate: %d\n" " output_sample_rate: %d\n", - sampling_rate, static_cast(opts_.frame_opts.samp_freq)); + sampling_rate, static_cast(config_.sampling_rate)); - float min_freq = - std::min(sampling_rate, opts_.frame_opts.samp_freq); + float min_freq = std::min(sampling_rate, config_.sampling_rate); float lowpass_cutoff = 0.99 * 0.5 * min_freq; int32_t lowpass_filter_width = 6; auto resampler = std::make_unique( - sampling_rate, opts_.frame_opts.samp_freq, lowpass_cutoff, + sampling_rate, config_.sampling_rate, lowpass_cutoff, lowpass_filter_width); std::vector samples; resampler->Resample(waveform, n, true, &samples); if (fbank_) { - fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, samples.data(), + fbank_->AcceptWaveform(config_.sampling_rate, samples.data(), samples.size()); fbank_->InputFinished(); + } else if 
(mfcc_) { + mfcc_->AcceptWaveform(config_.sampling_rate, samples.data(), + samples.size()); + mfcc_->InputFinished(); } else { - whisper_fbank_->AcceptWaveform(opts_.frame_opts.samp_freq, - samples.data(), samples.size()); + whisper_fbank_->AcceptWaveform(config_.sampling_rate, samples.data(), + samples.size()); whisper_fbank_->InputFinished(); } return; - } // if (sampling_rate != opts_.frame_opts.samp_freq) + } // if (sampling_rate != config_.sampling_rate) if (fbank_) { fbank_->AcceptWaveform(sampling_rate, waveform, n); fbank_->InputFinished(); + } else if (mfcc_) { + mfcc_->AcceptWaveform(sampling_rate, waveform, n); + mfcc_->InputFinished(); } else { whisper_fbank_->AcceptWaveform(sampling_rate, waveform, n); whisper_fbank_->InputFinished(); } } - int32_t FeatureDim() const { return opts_.mel_opts.num_bins; } + int32_t FeatureDim() const { + return mfcc_ ? mfcc_opts_.num_ceps : opts_.mel_opts.num_bins; + } std::vector GetFrames() const { - int32_t n = - fbank_ ? fbank_->NumFramesReady() : whisper_fbank_->NumFramesReady(); - + int32_t n = fbank_ ? fbank_->NumFramesReady() + : mfcc_ ? mfcc_->NumFramesReady() + : whisper_fbank_->NumFramesReady(); assert(n > 0 && "Please first call AcceptWaveform()"); int32_t feature_dim = FeatureDim(); @@ -170,8 +203,9 @@ class OfflineStream::Impl { float *p = features.data(); for (int32_t i = 0; i != n; ++i) { - const float *f = - fbank_ ? fbank_->GetFrame(i) : whisper_fbank_->GetFrame(i); + const float *f = fbank_ ? fbank_->GetFrame(i) + : mfcc_ ? 
mfcc_->GetFrame(i) + : whisper_fbank_->GetFrame(i); std::copy(f, f + feature_dim, p); p += feature_dim; } @@ -222,8 +256,10 @@ class OfflineStream::Impl { private: FeatureExtractorConfig config_; std::unique_ptr fbank_; + std::unique_ptr mfcc_; std::unique_ptr whisper_fbank_; knf::FbankOptions opts_; + knf::MfccOptions mfcc_opts_; OfflineRecognitionResult r_; ContextGraphPtr context_graph_; }; diff --git a/sherpa-onnx/csrc/offline-telespeech-ctc-model.cc b/sherpa-onnx/csrc/offline-telespeech-ctc-model.cc new file mode 100644 index 000000000..68c0afbe8 --- /dev/null +++ b/sherpa-onnx/csrc/offline-telespeech-ctc-model.cc @@ -0,0 +1,144 @@ +// sherpa-onnx/csrc/offline-telespeech-ctc-model.cc +// +// Copyright (c) 2023-2024 Xiaomi Corporation + +#include "sherpa-onnx/csrc/offline-telespeech-ctc-model.h" + +#include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/onnx-utils.h" +#include "sherpa-onnx/csrc/session.h" +#include "sherpa-onnx/csrc/text-utils.h" +#include "sherpa-onnx/csrc/transpose.h" + +namespace sherpa_onnx { + +class OfflineTeleSpeechCtcModel::Impl { + public: + explicit Impl(const OfflineModelConfig &config) + : config_(config), + env_(ORT_LOGGING_LEVEL_ERROR), + sess_opts_(GetSessionOptions(config)), + allocator_{} { + auto buf = ReadFile(config_.telespeech_ctc); + Init(buf.data(), buf.size()); + } + +#if __ANDROID_API__ >= 9 + Impl(AAssetManager *mgr, const OfflineModelConfig &config) + : config_(config), + env_(ORT_LOGGING_LEVEL_ERROR), + sess_opts_(GetSessionOptions(config)), + allocator_{} { + auto buf = ReadFile(mgr, config_.telespeech_ctc); + Init(buf.data(), buf.size()); + } +#endif + + std::vector Forward(Ort::Value features, + Ort::Value /*features_length*/) { + std::vector shape = + features.GetTensorTypeAndShapeInfo().GetShape(); + + if (static_cast(shape[0]) != 1) { + SHERPA_ONNX_LOGE("This model supports only batch size 1. 
Given %d", + static_cast(shape[0])); + } + + auto out = sess_->Run({}, input_names_ptr_.data(), &features, 1, + output_names_ptr_.data(), output_names_ptr_.size()); + + std::vector logits_shape = {1}; + Ort::Value logits_length = Ort::Value::CreateTensor( + allocator_, logits_shape.data(), logits_shape.size()); + + int64_t *dst = logits_length.GetTensorMutableData(); + dst[0] = out[0].GetTensorTypeAndShapeInfo().GetShape()[0]; + + // (T, B, C) -> (B, T, C) + Ort::Value logits = Transpose01(allocator_, &out[0]); + + std::vector ans; + ans.reserve(2); + ans.push_back(std::move(logits)); + ans.push_back(std::move(logits_length)); + + return ans; + } + + int32_t VocabSize() const { return vocab_size_; } + + int32_t SubsamplingFactor() const { return subsampling_factor_; } + + OrtAllocator *Allocator() const { return allocator_; } + + private: + void Init(void *model_data, size_t model_data_length) { + sess_ = std::make_unique(env_, model_data, model_data_length, + sess_opts_); + + GetInputNames(sess_.get(), &input_names_, &input_names_ptr_); + + GetOutputNames(sess_.get(), &output_names_, &output_names_ptr_); + + // get meta data + Ort::ModelMetadata meta_data = sess_->GetModelMetadata(); + if (config_.debug) { + std::ostringstream os; + PrintModelMetadata(os, meta_data); + SHERPA_ONNX_LOGE("%s\n", os.str().c_str()); + } + + { + auto shape = + sess_->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); + vocab_size_ = shape[2]; + } + } + + private: + OfflineModelConfig config_; + Ort::Env env_; + Ort::SessionOptions sess_opts_; + Ort::AllocatorWithDefaultOptions allocator_; + + std::unique_ptr sess_; + + std::vector input_names_; + std::vector input_names_ptr_; + + std::vector output_names_; + std::vector output_names_ptr_; + + int32_t vocab_size_ = 0; + int32_t subsampling_factor_ = 4; +}; + +OfflineTeleSpeechCtcModel::OfflineTeleSpeechCtcModel( + const OfflineModelConfig &config) + : impl_(std::make_unique(config)) {} + +#if __ANDROID_API__ >= 9 
+OfflineTeleSpeechCtcModel::OfflineTeleSpeechCtcModel( + AAssetManager *mgr, const OfflineModelConfig &config) + : impl_(std::make_unique(mgr, config)) {} +#endif + +OfflineTeleSpeechCtcModel::~OfflineTeleSpeechCtcModel() = default; + +std::vector OfflineTeleSpeechCtcModel::Forward( + Ort::Value features, Ort::Value features_length) { + return impl_->Forward(std::move(features), std::move(features_length)); +} + +int32_t OfflineTeleSpeechCtcModel::VocabSize() const { + return impl_->VocabSize(); +} +int32_t OfflineTeleSpeechCtcModel::SubsamplingFactor() const { + return impl_->SubsamplingFactor(); +} + +OrtAllocator *OfflineTeleSpeechCtcModel::Allocator() const { + return impl_->Allocator(); +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-telespeech-ctc-model.h b/sherpa-onnx/csrc/offline-telespeech-ctc-model.h new file mode 100644 index 000000000..42ef300ff --- /dev/null +++ b/sherpa-onnx/csrc/offline-telespeech-ctc-model.h @@ -0,0 +1,81 @@ +// sherpa-onnx/csrc/offline-telespeech-ctc-model.h +// +// Copyright (c) 2024 Xiaomi Corporation +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TELESPEECH_CTC_MODEL_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_TELESPEECH_CTC_MODEL_H_ +#include +#include +#include +#include + +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#include "onnxruntime_cxx_api.h" // NOLINT +#include "sherpa-onnx/csrc/offline-ctc-model.h" +#include "sherpa-onnx/csrc/offline-model-config.h" + +namespace sherpa_onnx { + +/** This class implements the CTC model from + * https://github.com/Tele-AI/TeleSpeech-ASR. 
+ * + * See + * https://github.com/lovemefan/telespeech-asr-python/blob/main/telespeechasr/onnx/onnx_infer.py + * and + * https://github.com/k2-fsa/sherpa-onnx/blob/master/scripts/tele-speech/test.py + */ +class OfflineTeleSpeechCtcModel : public OfflineCtcModel { + public: + explicit OfflineTeleSpeechCtcModel(const OfflineModelConfig &config); + +#if __ANDROID_API__ >= 9 + OfflineTeleSpeechCtcModel(AAssetManager *mgr, + const OfflineModelConfig &config); +#endif + + ~OfflineTeleSpeechCtcModel() override; + + /** Run the forward method of the model. + * + * @param features A tensor of shape (N, T, C). + * @param features_length A 1-D tensor of shape (N,) containing number of + * valid frames in `features` before padding. + * Its dtype is int64_t. + * + * @return Return a vector containing: + * - log_probs: A 3-D tensor of shape (N, T', vocab_size). + * - log_probs_length A 1-D tensor of shape (N,). Its dtype is int64_t + */ + std::vector Forward(Ort::Value features, + Ort::Value features_length) override; + + /** Return the vocabulary size of the model + */ + int32_t VocabSize() const override; + + /** SubsamplingFactor of the model + */ + int32_t SubsamplingFactor() const override; + + /** Return an allocator for allocating memory + */ + OrtAllocator *Allocator() const override; + + // TeleSpeech CTC models do not support batch size > 1 + bool SupportBatchProcessing() const override { return false; } + + std::string FeatureNormalizationMethod() const override { + return "per_feature"; + } + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_TELESPEECH_CTC_MODEL_H_ diff --git a/sherpa-onnx/csrc/online-model-config.cc b/sherpa-onnx/csrc/online-model-config.cc index 5ea24babe..a8efa870d 100644 --- a/sherpa-onnx/csrc/online-model-config.cc +++ b/sherpa-onnx/csrc/online-model-config.cc @@ -66,7 +66,7 @@ bool OnlineModelConfig::Validate() const { if (!modeling_unit.empty() && (modeling_unit == "bpe" 
|| modeling_unit == "cjkchar+bpe")) { if (!FileExists(bpe_vocab)) { - SHERPA_ONNX_LOGE("bpe_vocab: %s does not exist", bpe_vocab.c_str()); + SHERPA_ONNX_LOGE("bpe_vocab: '%s' does not exist", bpe_vocab.c_str()); return false; } } diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineModelConfig.java index c7eba0237..6b44a84ba 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineModelConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineModelConfig.java @@ -7,6 +7,7 @@ public class OfflineModelConfig { private final OfflineParaformerModelConfig paraformer; private final OfflineWhisperModelConfig whisper; private final OfflineNemoEncDecCtcModelConfig nemo; + private final String teleSpeech; private final String tokens; private final int numThreads; private final boolean debug; @@ -21,6 +22,7 @@ private OfflineModelConfig(Builder builder) { this.paraformer = builder.paraformer; this.whisper = builder.whisper; this.nemo = builder.nemo; + this.teleSpeech = builder.teleSpeech; this.tokens = builder.tokens; this.numThreads = builder.numThreads; this.debug = builder.debug; @@ -74,11 +76,16 @@ public String getBpeVocab() { return bpeVocab; } + public String getTeleSpeech() { + return teleSpeech; + } + public static class Builder { private OfflineParaformerModelConfig paraformer = OfflineParaformerModelConfig.builder().build(); private OfflineTransducerModelConfig transducer = OfflineTransducerModelConfig.builder().build(); private OfflineWhisperModelConfig whisper = OfflineWhisperModelConfig.builder().build(); private OfflineNemoEncDecCtcModelConfig nemo = OfflineNemoEncDecCtcModelConfig.builder().build(); + private String teleSpeech = ""; private String tokens = ""; private int numThreads = 1; private boolean debug = true; @@ -106,6 +113,12 @@ public Builder setNemo(OfflineNemoEncDecCtcModelConfig nemo) { return this; } + + public Builder 
setTeleSpeech(String teleSpeech) { + this.teleSpeech = teleSpeech; + return this; + } + public Builder setWhisper(OfflineWhisperModelConfig whisper) { this.whisper = whisper; return this; diff --git a/sherpa-onnx/jni/offline-recognizer.cc b/sherpa-onnx/jni/offline-recognizer.cc index c067451a4..cf69389a3 100644 --- a/sherpa-onnx/jni/offline-recognizer.cc +++ b/sherpa-onnx/jni/offline-recognizer.cc @@ -172,6 +172,12 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) { ans.model_config.nemo_ctc.model = p; env->ReleaseStringUTFChars(s, p); + fid = env->GetFieldID(model_config_cls, "teleSpeech", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(model_config, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.model_config.telespeech_ctc = p; + env->ReleaseStringUTFChars(s, p); + return ans; } diff --git a/sherpa-onnx/kotlin-api/OfflineRecognizer.kt b/sherpa-onnx/kotlin-api/OfflineRecognizer.kt index e7f72884e..151ac73d5 100644 --- a/sherpa-onnx/kotlin-api/OfflineRecognizer.kt +++ b/sherpa-onnx/kotlin-api/OfflineRecognizer.kt @@ -35,6 +35,7 @@ data class OfflineModelConfig( var paraformer: OfflineParaformerModelConfig = OfflineParaformerModelConfig(), var whisper: OfflineWhisperModelConfig = OfflineWhisperModelConfig(), var nemo: OfflineNemoEncDecCtcModelConfig = OfflineNemoEncDecCtcModelConfig(), + var teleSpeech: String = "", var numThreads: Int = 1, var debug: Boolean = false, var provider: String = "cpu", @@ -272,6 +273,15 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? 
{ tokens = "$modelDir/tokens.txt", ) } + + 11 -> { + val modelDir = "sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04" + return OfflineModelConfig( + teleSpeech = "$modelDir/model.int8.onnx", + tokens = "$modelDir/tokens.txt", + modelType = "tele_speech", + ) + } } return null } diff --git a/sherpa-onnx/python/csrc/offline-model-config.cc b/sherpa-onnx/python/csrc/offline-model-config.cc index 3fc3b34c2..a72c182ea 100644 --- a/sherpa-onnx/python/csrc/offline-model-config.cc +++ b/sherpa-onnx/python/csrc/offline-model-config.cc @@ -29,25 +29,27 @@ void PybindOfflineModelConfig(py::module *m) { using PyClass = OfflineModelConfig; py::class_(*m, "OfflineModelConfig") - .def(py::init(), - py::arg("transducer") = OfflineTransducerModelConfig(), - py::arg("paraformer") = OfflineParaformerModelConfig(), - py::arg("nemo_ctc") = OfflineNemoEncDecCtcModelConfig(), - py::arg("whisper") = OfflineWhisperModelConfig(), - py::arg("tdnn") = OfflineTdnnModelConfig(), - py::arg("zipformer_ctc") = OfflineZipformerCtcModelConfig(), - py::arg("wenet_ctc") = OfflineWenetCtcModelConfig(), - py::arg("tokens"), py::arg("num_threads"), py::arg("debug") = false, - py::arg("provider") = "cpu", py::arg("model_type") = "", - py::arg("modeling_unit") = "cjkchar", py::arg("bpe_vocab") = "") + .def( + py::init< + const OfflineTransducerModelConfig &, + const OfflineParaformerModelConfig &, + const OfflineNemoEncDecCtcModelConfig &, + const OfflineWhisperModelConfig &, const OfflineTdnnModelConfig &, + const OfflineZipformerCtcModelConfig &, + const OfflineWenetCtcModelConfig &, const std::string &, + const std::string &, int32_t, bool, const std::string &, + const std::string &, const std::string &, const std::string &>(), + py::arg("transducer") = OfflineTransducerModelConfig(), + py::arg("paraformer") = OfflineParaformerModelConfig(), + py::arg("nemo_ctc") = OfflineNemoEncDecCtcModelConfig(), + py::arg("whisper") = OfflineWhisperModelConfig(), + py::arg("tdnn") = OfflineTdnnModelConfig(), + 
py::arg("zipformer_ctc") = OfflineZipformerCtcModelConfig(), + py::arg("wenet_ctc") = OfflineWenetCtcModelConfig(), + py::arg("telespeech_ctc") = "", py::arg("tokens"), + py::arg("num_threads"), py::arg("debug") = false, + py::arg("provider") = "cpu", py::arg("model_type") = "", + py::arg("modeling_unit") = "cjkchar", py::arg("bpe_vocab") = "") .def_readwrite("transducer", &PyClass::transducer) .def_readwrite("paraformer", &PyClass::paraformer) .def_readwrite("nemo_ctc", &PyClass::nemo_ctc) @@ -55,6 +57,7 @@ void PybindOfflineModelConfig(py::module *m) { .def_readwrite("tdnn", &PyClass::tdnn) .def_readwrite("zipformer_ctc", &PyClass::zipformer_ctc) .def_readwrite("wenet_ctc", &PyClass::wenet_ctc) + .def_readwrite("telespeech_ctc", &PyClass::telespeech_ctc) .def_readwrite("tokens", &PyClass::tokens) .def_readwrite("num_threads", &PyClass::num_threads) .def_readwrite("debug", &PyClass::debug) diff --git a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py index 87c5132d2..480ea23ce 100644 --- a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py +++ b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py @@ -211,6 +211,71 @@ def from_paraformer( self.config = recognizer_config return self + @classmethod + def from_telespeech_ctc( + cls, + model: str, + tokens: str, + num_threads: int = 1, + sample_rate: int = 16000, + feature_dim: int = 40, + decoding_method: str = "greedy_search", + debug: bool = False, + provider: str = "cpu", + ): + """ + Please refer to + ``_ + to download pre-trained models. + + Args: + model: + Path to ``model.onnx``. + tokens: + Path to ``tokens.txt``. Each line in ``tokens.txt`` contains two + columns:: + + symbol integer_id + + num_threads: + Number of threads for neural network computation. + sample_rate: + Sample rate of the training data used to train the model. It is + ignored and is hard-coded in C++ to 40. + feature_dim: + Dimension of the feature used to train the model. 
It is ignored + and is hard-coded in C++ to 40. + decoding_method: + Valid values are greedy_search. + debug: + True to show debug messages. + provider: + onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + """ + self = cls.__new__(cls) + model_config = OfflineModelConfig( + telespeech_ctc=model, + tokens=tokens, + num_threads=num_threads, + debug=debug, + provider=provider, + model_type="nemo_ctc", + ) + + feat_config = FeatureExtractorConfig( + sampling_rate=sample_rate, + feature_dim=feature_dim, + ) + + recognizer_config = OfflineRecognizerConfig( + feat_config=feat_config, + model_config=model_config, + decoding_method=decoding_method, + ) + self.recognizer = _Recognizer(recognizer_config) + self.config = recognizer_config + return self + @classmethod def from_nemo_ctc( cls, diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index 0c8d22f3d..7346ac4b8 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -102,7 +102,7 @@ func sherpaOnnxOnlineModelConfig( debug: Int32(debug), model_type: toCPointer(modelType), modeling_unit: toCPointer(modelingUnit), - bpeVocab: toCPointer(bpeVocab) + bpe_vocab: toCPointer(bpeVocab) ) } @@ -360,7 +360,8 @@ func sherpaOnnxOfflineModelConfig( debug: Int = 0, modelType: String = "", modelingUnit: String = "cjkchar", - bpeVocab: String = "" + bpeVocab: String = "", + teleSpeechCtc: String = "" ) -> SherpaOnnxOfflineModelConfig { return SherpaOnnxOfflineModelConfig( transducer: transducer, @@ -374,7 +375,8 @@ func sherpaOnnxOfflineModelConfig( provider: toCPointer(provider), model_type: toCPointer(modelType), modeling_unit: toCPointer(modelingUnit), - bpeVocab: toCPointer(bpeVocab) + bpe_vocab: toCPointer(bpeVocab), + telespeech_ctc: toCPointer(teleSpeechCtc) ) } diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index c77794a68..53afe1875 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ 
-529,7 +529,7 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { const tdnn = initSherpaOnnxOfflineTdnnModelConfig(config.tdnn, Module); const len = transducer.len + paraformer.len + nemoCtc.len + whisper.len + - tdnn.len + 7 * 4; + tdnn.len + 8 * 4; const ptr = Module._malloc(len); let offset = 0; @@ -553,9 +553,11 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1; const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1; + const teleSpeechCtcLen = + Module.lengthBytesUTF8(config.teleSpeechCtc || '') + 1; - const bufferLen = - tokensLen + providerLen + modelTypeLen + modelingUnitLen + bpeVocabLen; + const bufferLen = tokensLen + providerLen + modelTypeLen + modelingUnitLen + + bpeVocabLen + teleSpeechCtcLen; const buffer = Module._malloc(bufferLen); offset = 0; @@ -575,6 +577,10 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { Module.stringToUTF8(config.bpeVocab || '', buffer + offset, bpeVocabLen); offset += bpeVocabLen; + Module.stringToUTF8( + config.teleSpeechCtc || '', buffer + offset, teleSpeechCtcLen); + offset += teleSpeechCtcLen; + offset = transducer.len + paraformer.len + nemoCtc.len + whisper.len + tdnn.len; Module.setValue(ptr + offset, buffer, 'i8*'); // tokens @@ -604,6 +610,13 @@ function initSherpaOnnxOfflineModelConfig(config, Module) { 'i8*'); // bpeVocab offset += 4; + Module.setValue( + ptr + offset, + buffer + tokensLen + providerLen + modelTypeLen + modelingUnitLen + + bpeVocabLen, + 'i8*'); // teleSpeechCtc + offset += 4; + return { buffer: buffer, ptr: ptr, len: len, transducer: transducer, paraformer: paraformer, nemoCtc: nemoCtc, whisper: whisper, tdnn: tdnn diff --git a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc index ceb5a2442..177fb6f04 100644 --- a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc +++ 
b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc @@ -23,7 +23,7 @@ static_assert(sizeof(SherpaOnnxOfflineModelConfig) == sizeof(SherpaOnnxOfflineParaformerModelConfig) + sizeof(SherpaOnnxOfflineNemoEncDecCtcModelConfig) + sizeof(SherpaOnnxOfflineWhisperModelConfig) + - sizeof(SherpaOnnxOfflineTdnnModelConfig) + 7 * 4, + sizeof(SherpaOnnxOfflineTdnnModelConfig) + 8 * 4, ""); static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) == @@ -92,6 +92,7 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { fprintf(stdout, "model type: %s\n", model_config->model_type); fprintf(stdout, "modeling unit: %s\n", model_config->modeling_unit); fprintf(stdout, "bpe vocab: %s\n", model_config->bpe_vocab); + fprintf(stdout, "telespeech_ctc: %s\n", model_config->telespeech_ctc); fprintf(stdout, "----------feat config----------\n"); fprintf(stdout, "sample rate: %d\n", feat->sample_rate); From 7e0931c762fc40180e267764d972fe4249a4e95c Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 5 Jun 2024 11:23:19 +0800 Subject: [PATCH 004/237] Fix punctuation (#976) --- .github/workflows/sanitizer.yaml | 14 ++++++++------ .../csrc/offline-punctuation-ct-transformer-impl.h | 11 ++++++----- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/.github/workflows/sanitizer.yaml b/.github/workflows/sanitizer.yaml index ef348a380..2d4abf49c 100644 --- a/.github/workflows/sanitizer.yaml +++ b/.github/workflows/sanitizer.yaml @@ -76,6 +76,14 @@ jobs: otool -L build/bin/sherpa-onnx otool -l build/bin/sherpa-onnx + - name: Test offline punctuation + shell: bash + run: | + export PATH=$PWD/build/bin:$PATH + export EXE=sherpa-onnx-offline-punctuation + + .github/scripts/test-offline-punctuation.sh + - name: Test offline transducer shell: bash run: | @@ -92,13 +100,7 @@ jobs: .github/scripts/test-online-ctc.sh - - name: Test offline punctuation - shell: bash - run: | - export PATH=$PWD/build/bin:$PATH - export 
EXE=sherpa-onnx-offline-punctuation - .github/scripts/test-offline-punctuation.sh - name: Test C API shell: bash diff --git a/sherpa-onnx/csrc/offline-punctuation-ct-transformer-impl.h b/sherpa-onnx/csrc/offline-punctuation-ct-transformer-impl.h index 4d05fb503..eb2c46d6a 100644 --- a/sherpa-onnx/csrc/offline-punctuation-ct-transformer-impl.h +++ b/sherpa-onnx/csrc/offline-punctuation-ct-transformer-impl.h @@ -69,8 +69,8 @@ class OfflinePunctuationCtTransformerImpl : public OfflinePunctuationImpl { std::vector punctuations; int32_t last = -1; for (int32_t i = 0; i != num_segments; ++i) { - int32_t this_start = i * segment_size; // inclusive - int32_t this_end = this_start + segment_size; // exclusive + int32_t this_start = i * segment_size; // included + int32_t this_end = this_start + segment_size; // not included if (this_end > static_cast(token_ids.size())) { this_end = token_ids.size(); } @@ -113,7 +113,8 @@ class OfflinePunctuationCtTransformerImpl : public OfflinePunctuationImpl { int32_t dot_index = -1; int32_t comma_index = -1; - for (int32_t m = this_punctuations.size() - 2; m >= 1; --m) { + for (int32_t m = static_cast(this_punctuations.size()) - 2; + m >= 1; --m) { int32_t punct_id = this_punctuations[m]; if (punct_id == meta_data.dot_id || punct_id == meta_data.quest_id) { @@ -137,13 +138,13 @@ class OfflinePunctuationCtTransformerImpl : public OfflinePunctuationImpl { } if (i == num_segments - 1) { - dot_index = token_ids.size() - 1; + dot_index = static_cast(this_punctuations.size()) - 1; } } else { last = this_start + dot_index + 1; } - if (dot_index != 1) { + if (dot_index != -1) { punctuations.insert(punctuations.end(), this_punctuations.begin(), this_punctuations.begin() + (dot_index + 1)); } From 69347ffc8f299060a11e43cf6e194822b6356581 Mon Sep 17 00:00:00 2001 From: Manix <50542248+manickavela29@users.noreply.github.com> Date: Thu, 6 Jun 2024 08:15:28 +0530 Subject: [PATCH 005/237] Support TensorRT provider (#921) Signed-off-by: 
manickavela1998@gmail.com Signed-off-by: manickavela1998@gmail.com --- sherpa-onnx/csrc/provider.cc | 2 ++ sherpa-onnx/csrc/provider.h | 1 + sherpa-onnx/csrc/session.cc | 62 +++++++++++++++++++++++++++++++++++- 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/sherpa-onnx/csrc/provider.cc b/sherpa-onnx/csrc/provider.cc index 95bc18c5f..19d585976 100644 --- a/sherpa-onnx/csrc/provider.cc +++ b/sherpa-onnx/csrc/provider.cc @@ -24,6 +24,8 @@ Provider StringToProvider(std::string s) { return Provider::kXnnpack; } else if (s == "nnapi") { return Provider::kNNAPI; + } else if (s == "trt") { + return Provider::kTRT; } else { SHERPA_ONNX_LOGE("Unsupported string: %s. Fallback to cpu", s.c_str()); return Provider::kCPU; diff --git a/sherpa-onnx/csrc/provider.h b/sherpa-onnx/csrc/provider.h index 467e5dab5..c104d401a 100644 --- a/sherpa-onnx/csrc/provider.h +++ b/sherpa-onnx/csrc/provider.h @@ -18,6 +18,7 @@ enum class Provider { kCoreML = 2, // CoreMLExecutionProvider kXnnpack = 3, // XnnpackExecutionProvider kNNAPI = 4, // NnapiExecutionProvider + kTRT = 5, // TensorRTExecutionProvider }; /** diff --git a/sherpa-onnx/csrc/session.cc b/sherpa-onnx/csrc/session.cc index d0a697404..431a6a761 100644 --- a/sherpa-onnx/csrc/session.cc +++ b/sherpa-onnx/csrc/session.cc @@ -21,6 +21,16 @@ namespace sherpa_onnx { + +static void OrtStatusFailure(OrtStatus *status, const char *s) { + const auto &api = Ort::GetApi(); + const char *msg = api.GetErrorMessage(status); + SHERPA_ONNX_LOGE( + "Failed to enable TensorRT : %s." + "Available providers: %s. 
Fallback to cuda", msg, s); + api.ReleaseStatus(status); +} + static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads, std::string provider_str) { Provider p = StringToProvider(std::move(provider_str)); @@ -53,6 +63,57 @@ static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads, } break; } + case Provider::kTRT: { + struct TrtPairs { + const char* op_keys; + const char* op_values; + }; + + std::vector trt_options = { + {"device_id", "0"}, + {"trt_max_workspace_size", "2147483648"}, + {"trt_max_partition_iterations", "10"}, + {"trt_min_subgraph_size", "5"}, + {"trt_fp16_enable", "0"}, + {"trt_detailed_build_log", "0"}, + {"trt_engine_cache_enable", "1"}, + {"trt_engine_cache_path", "."}, + {"trt_timing_cache_enable", "1"}, + {"trt_timing_cache_path", "."} + }; + // ToDo : Trt configs + // "trt_int8_enable" + // "trt_int8_use_native_calibration_table" + // "trt_dump_subgraphs" + + std::vector option_keys, option_values; + for (const TrtPairs& pair : trt_options) { + option_keys.emplace_back(pair.op_keys); + option_values.emplace_back(pair.op_values); + } + + std::vector available_providers = + Ort::GetAvailableProviders(); + if (std::find(available_providers.begin(), available_providers.end(), + "TensorrtExecutionProvider") != available_providers.end()) { + const auto& api = Ort::GetApi(); + + OrtTensorRTProviderOptionsV2* tensorrt_options; + OrtStatus *statusC = api.CreateTensorRTProviderOptions( + &tensorrt_options); + OrtStatus *statusU = api.UpdateTensorRTProviderOptions( + tensorrt_options, option_keys.data(), option_values.data(), + option_keys.size()); + sess_opts.AppendExecutionProvider_TensorRT_V2(*tensorrt_options); + + if (statusC) { OrtStatusFailure(statusC, os.str().c_str()); } + if (statusU) { OrtStatusFailure(statusU, os.str().c_str()); } + + api.ReleaseTensorRTProviderOptions(tensorrt_options); + } + // break; is omitted here intentionally so that + // if TRT not available, CUDA will be used + } case Provider::kCUDA: { if 
(std::find(available_providers.begin(), available_providers.end(), "CUDAExecutionProvider") != available_providers.end()) { @@ -116,7 +177,6 @@ static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads, break; } } - return sess_opts; } From 1a43d1e37f2a65a7326e75be4607b4996f9737a8 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 6 Jun 2024 14:22:39 +0800 Subject: [PATCH 006/237] Support getting word IDs for CTC HLG decoding. (#978) --- sherpa-onnx/csrc/offline-ctc-decoder.h | 8 ++++++++ sherpa-onnx/csrc/offline-ctc-fst-decoder.cc | 3 +++ .../offline-paraformer-greedy-search-decoder.cc | 4 ---- sherpa-onnx/csrc/offline-recognizer-ctc-impl.h | 2 ++ sherpa-onnx/csrc/offline-stream.cc | 14 ++++++++++++++ sherpa-onnx/csrc/offline-stream.h | 2 ++ sherpa-onnx/csrc/online-ctc-decoder.h | 8 ++++++++ sherpa-onnx/csrc/online-ctc-fst-decoder.cc | 7 ++++--- sherpa-onnx/csrc/online-recognizer-ctc-impl.h | 1 + sherpa-onnx/csrc/online-recognizer.cc | 17 +++++++++++------ sherpa-onnx/csrc/online-recognizer.h | 2 ++ sherpa-onnx/python/csrc/offline-stream.cc | 2 ++ sherpa-onnx/python/csrc/online-recognizer.cc | 3 +++ 13 files changed, 60 insertions(+), 13 deletions(-) diff --git a/sherpa-onnx/csrc/offline-ctc-decoder.h b/sherpa-onnx/csrc/offline-ctc-decoder.h index 23e8d0bd8..c9d1b36ff 100644 --- a/sherpa-onnx/csrc/offline-ctc-decoder.h +++ b/sherpa-onnx/csrc/offline-ctc-decoder.h @@ -15,8 +15,16 @@ struct OfflineCtcDecoderResult { /// The decoded token IDs std::vector tokens; + /// The decoded word IDs + /// Note: tokens.size() is usually not equal to words.size() + /// words is empty for greedy search decoding. + /// it is not empty when an HLG graph or an HLG graph is used. + std::vector words; + /// timestamps[i] contains the output frame index where tokens[i] is decoded. 
/// Note: The index is after subsampling + /// + /// tokens.size() == timestamps.size() std::vector timestamps; }; diff --git a/sherpa-onnx/csrc/offline-ctc-fst-decoder.cc b/sherpa-onnx/csrc/offline-ctc-fst-decoder.cc index e54274df4..6c9df3fd3 100644 --- a/sherpa-onnx/csrc/offline-ctc-fst-decoder.cc +++ b/sherpa-onnx/csrc/offline-ctc-fst-decoder.cc @@ -108,6 +108,9 @@ static OfflineCtcDecoderResult DecodeOne(kaldi_decoder::FasterDecoder *decoder, // -1 here since the input labels are incremented during graph // construction r.tokens.push_back(arc.ilabel - 1); + if (arc.olabel != 0) { + r.words.push_back(arc.olabel); + } r.timestamps.push_back(t); prev = arc.ilabel; diff --git a/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc b/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc index 0e31bd97c..cdaf79413 100644 --- a/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc +++ b/sherpa-onnx/csrc/offline-paraformer-greedy-search-decoder.cc @@ -64,10 +64,6 @@ OfflineParaformerGreedySearchDecoder::Decode( if (timestamps.size() == results[i].tokens.size()) { results[i].timestamps = std::move(timestamps); - } else { - SHERPA_ONNX_LOGE("time stamp for batch: %d, %d vs %d", i, - static_cast(results[i].tokens.size()), - static_cast(timestamps.size())); } } } diff --git a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h index 2c83dac28..c64da12af 100644 --- a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h @@ -65,6 +65,8 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src, r.timestamps.push_back(time); } + r.words = std::move(src.words); + return r; } diff --git a/sherpa-onnx/csrc/offline-stream.cc b/sherpa-onnx/csrc/offline-stream.cc index 4321a62dd..6e72a4a1f 100644 --- a/sherpa-onnx/csrc/offline-stream.cc +++ b/sherpa-onnx/csrc/offline-stream.cc @@ -339,6 +339,20 @@ std::string OfflineRecognitionResult::AsJsonString() 
const { } sep = ", "; } + os << "], "; + + sep = ""; + + os << "\"" + << "words" + << "\"" + << ": "; + os << "["; + for (int32_t w : words) { + os << sep << w; + sep = ", "; + } + os << "]"; os << "}"; diff --git a/sherpa-onnx/csrc/offline-stream.h b/sherpa-onnx/csrc/offline-stream.h index 13cc56004..9df46d04e 100644 --- a/sherpa-onnx/csrc/offline-stream.h +++ b/sherpa-onnx/csrc/offline-stream.h @@ -30,6 +30,8 @@ struct OfflineRecognitionResult { /// timestamps[i] records the time in seconds when tokens[i] is decoded. std::vector timestamps; + std::vector words; + std::string AsJsonString() const; }; diff --git a/sherpa-onnx/csrc/online-ctc-decoder.h b/sherpa-onnx/csrc/online-ctc-decoder.h index 28809e39f..65305e6ac 100644 --- a/sherpa-onnx/csrc/online-ctc-decoder.h +++ b/sherpa-onnx/csrc/online-ctc-decoder.h @@ -22,8 +22,16 @@ struct OnlineCtcDecoderResult { /// The decoded token IDs std::vector tokens; + /// The decoded word IDs + /// Note: tokens.size() is usually not equal to words.size() + /// words is empty for greedy search decoding. + /// it is not empty when an HLG graph or an HLG graph is used. + std::vector words; + /// timestamps[i] contains the output frame index where tokens[i] is decoded. 
/// Note: The index is after subsampling + /// + /// tokens.size() == timestamps.size() std::vector timestamps; int32_t num_trailing_blanks = 0; diff --git a/sherpa-onnx/csrc/online-ctc-fst-decoder.cc b/sherpa-onnx/csrc/online-ctc-fst-decoder.cc index 93e4c103b..f50578833 100644 --- a/sherpa-onnx/csrc/online-ctc-fst-decoder.cc +++ b/sherpa-onnx/csrc/online-ctc-fst-decoder.cc @@ -51,9 +51,9 @@ static void DecodeOne(const float *log_probs, int32_t num_rows, bool ok = decoder->GetBestPath(&fst_out); if (ok) { std::vector isymbols_out; - std::vector osymbols_out_unused; - ok = fst::GetLinearSymbolSequence(fst_out, &isymbols_out, - &osymbols_out_unused, nullptr); + std::vector osymbols_out; + ok = fst::GetLinearSymbolSequence(fst_out, &isymbols_out, &osymbols_out, + nullptr); std::vector tokens; tokens.reserve(isymbols_out.size()); @@ -83,6 +83,7 @@ static void DecodeOne(const float *log_probs, int32_t num_rows, } result->tokens = std::move(tokens); + result->words = std::move(osymbols_out); result->timestamps = std::move(timestamps); // no need to set frame_offset } diff --git a/sherpa-onnx/csrc/online-recognizer-ctc-impl.h b/sherpa-onnx/csrc/online-recognizer-ctc-impl.h index 7b85ceefd..4d8ce2961 100644 --- a/sherpa-onnx/csrc/online-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/online-recognizer-ctc-impl.h @@ -59,6 +59,7 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src, } r.segment = segment; + r.words = std::move(src.words); r.start_time = frames_since_start * frame_shift_ms / 1000.; return r; diff --git a/sherpa-onnx/csrc/online-recognizer.cc b/sherpa-onnx/csrc/online-recognizer.cc index 9004d3fbf..fcb9169ef 100644 --- a/sherpa-onnx/csrc/online-recognizer.cc +++ b/sherpa-onnx/csrc/online-recognizer.cc @@ -22,14 +22,16 @@ namespace sherpa_onnx { template std::string VecToString(const std::vector &vec, int32_t precision = 6) { std::ostringstream oss; - oss << std::fixed << std::setprecision(precision); - oss << "[ "; + if (precision != 0) { + 
oss << std::fixed << std::setprecision(precision); + } + oss << "["; std::string sep = ""; for (const auto &item : vec) { oss << sep << item; sep = ", "; } - oss << " ]"; + oss << "]"; return oss.str(); } @@ -38,26 +40,29 @@ template <> // explicit specialization for T = std::string std::string VecToString(const std::vector &vec, int32_t) { // ignore 2nd arg std::ostringstream oss; - oss << "[ "; + oss << "["; std::string sep = ""; for (const auto &item : vec) { oss << sep << "\"" << item << "\""; sep = ", "; } - oss << " ]"; + oss << "]"; return oss.str(); } std::string OnlineRecognizerResult::AsJsonString() const { std::ostringstream os; os << "{ "; - os << "\"text\": " << "\"" << text << "\"" << ", "; + os << "\"text\": " + << "\"" << text << "\"" + << ", "; os << "\"tokens\": " << VecToString(tokens) << ", "; os << "\"timestamps\": " << VecToString(timestamps, 2) << ", "; os << "\"ys_probs\": " << VecToString(ys_probs, 6) << ", "; os << "\"lm_probs\": " << VecToString(lm_probs, 6) << ", "; os << "\"context_scores\": " << VecToString(context_scores, 6) << ", "; os << "\"segment\": " << segment << ", "; + os << "\"words\": " << VecToString(words, 0) << ", "; os << "\"start_time\": " << std::fixed << std::setprecision(2) << start_time << ", "; os << "\"is_final\": " << (is_final ? 
"true" : "false"); diff --git a/sherpa-onnx/csrc/online-recognizer.h b/sherpa-onnx/csrc/online-recognizer.h index c04122ea0..f7fcf2f21 100644 --- a/sherpa-onnx/csrc/online-recognizer.h +++ b/sherpa-onnx/csrc/online-recognizer.h @@ -47,6 +47,8 @@ struct OnlineRecognizerResult { /// log-domain scores from "hot-phrase" contextual boosting std::vector context_scores; + std::vector words; + /// ID of this segment /// When an endpoint is detected, it is incremented int32_t segment = 0; diff --git a/sherpa-onnx/python/csrc/offline-stream.cc b/sherpa-onnx/python/csrc/offline-stream.cc index 5679eca7b..3c1cf3486 100644 --- a/sherpa-onnx/python/csrc/offline-stream.cc +++ b/sherpa-onnx/python/csrc/offline-stream.cc @@ -34,6 +34,8 @@ static void PybindOfflineRecognitionResult(py::module *m) { // NOLINT }) .def_property_readonly("tokens", [](const PyClass &self) { return self.tokens; }) + .def_property_readonly("words", + [](const PyClass &self) { return self.words; }) .def_property_readonly( "timestamps", [](const PyClass &self) { return self.timestamps; }); } diff --git a/sherpa-onnx/python/csrc/online-recognizer.cc b/sherpa-onnx/python/csrc/online-recognizer.cc index c402163fe..148f73ee5 100644 --- a/sherpa-onnx/python/csrc/online-recognizer.cc +++ b/sherpa-onnx/python/csrc/online-recognizer.cc @@ -40,6 +40,9 @@ static void PybindOnlineRecognizerResult(py::module *m) { }) .def_property_readonly( "segment", [](PyClass &self) -> int32_t { return self.segment; }) + .def_property_readonly( + "words", + [](PyClass &self) -> std::vector { return self.words; }) .def_property_readonly( "is_final", [](PyClass &self) -> bool { return self.is_final; }) .def("__str__", &PyClass::AsJsonString, From fc09227cd1aada502e12e3c9b4cc948c7e55d68f Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 10 Jun 2024 21:01:48 +0800 Subject: [PATCH 007/237] Add Python example to show how to register speakers dynamically for speaker ID. 
(#986) --- ...speaker-identification-with-vad-dynamic.py | 221 ++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100755 python-api-examples/speaker-identification-with-vad-dynamic.py diff --git a/python-api-examples/speaker-identification-with-vad-dynamic.py b/python-api-examples/speaker-identification-with-vad-dynamic.py new file mode 100755 index 000000000..c6cc80bbe --- /dev/null +++ b/python-api-examples/speaker-identification-with-vad-dynamic.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 + +""" +This script shows how to use Python APIs for speaker identification with +a microphone and a VAD model + +Usage: + +(1) Download a model for computing speaker embeddings + +Please visit +https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models +to download a model. An example is given below: + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx + +Note that `zh` means Chinese, while `en` means English. + +(2) Download the VAD model +Please visit +https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx +to download silero_vad.onnx + +For instance, + +wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx + +(3) Run this script + +python3 ./python-api-examples/speaker-identification-with-vad-dynamic.py \ + --silero-vad-model=/path/to/silero_vad.onnx \ + --model ./3dspeaker_speech_eres2net_large_sv_zh-cn_3dspeaker_16k.onnx +""" +import argparse +import sys + +import numpy as np +import sherpa_onnx + +try: + import sounddevice as sd +except ImportError: + print("Please install sounddevice first. 
You can use") + print() + print(" pip install sounddevice") + print() + print("to install it") + sys.exit(-1) + +g_sample_rate = 16000 + + +def get_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--model", + type=str, + required=True, + help="Path to the speaker embedding model file.", + ) + + parser.add_argument( + "--silero-vad-model", + type=str, + required=True, + help="Path to silero_vad.onnx", + ) + + parser.add_argument("--threshold", type=float, default=0.4) + + parser.add_argument( + "--num-threads", + type=int, + default=1, + help="Number of threads for neural network computation", + ) + + parser.add_argument( + "--debug", + type=bool, + default=False, + help="True to show debug messages", + ) + + parser.add_argument( + "--provider", + type=str, + default="cpu", + help="Valid values: cpu, cuda, coreml", + ) + + return parser.parse_args() + + +def load_speaker_embedding_model(args): + config = sherpa_onnx.SpeakerEmbeddingExtractorConfig( + model=args.model, + num_threads=args.num_threads, + debug=args.debug, + provider=args.provider, + ) + if not config.validate(): + raise ValueError(f"Invalid config. {config}") + extractor = sherpa_onnx.SpeakerEmbeddingExtractor(config) + return extractor + + +def compute_speaker_embedding( + samples: np.ndarray, + extractor: sherpa_onnx.SpeakerEmbeddingExtractor, +) -> np.ndarray: + """ + Args: + samples: + A 1-D float32 array. + extractor: + The return value of function load_speaker_embedding_model(). + Returns: + Return a 1-D float32 array. 
+ """ + if len(samples) < g_sample_rate: + print(f"Your input contains only {len(samples)} samples!") + + stream = extractor.create_stream() + stream.accept_waveform(sample_rate=g_sample_rate, waveform=samples) + stream.input_finished() + + assert extractor.is_ready(stream) + embedding = extractor.compute(stream) + embedding = np.array(embedding) + return embedding + + +def main(): + args = get_args() + print(args) + + devices = sd.query_devices() + if len(devices) == 0: + print("No microphone devices found") + sys.exit(0) + + print(devices) + # If you want to select a different device, please change + # sd.default.device[0]. For instance, if you want to select device 10, + # please use + # + # sd.default.device[0] = 4 + # print(devices) + # + + default_input_device_idx = sd.default.device[0] + print(f'Use default device: {devices[default_input_device_idx]["name"]}') + + extractor = load_speaker_embedding_model(args) + + manager = sherpa_onnx.SpeakerEmbeddingManager(extractor.dim) + + vad_config = sherpa_onnx.VadModelConfig() + vad_config.silero_vad.model = args.silero_vad_model + vad_config.silero_vad.min_silence_duration = 0.25 + vad_config.silero_vad.min_speech_duration = 1.0 + vad_config.sample_rate = g_sample_rate + + window_size = vad_config.silero_vad.window_size + vad = sherpa_onnx.VoiceActivityDetector(vad_config, buffer_size_in_seconds=100) + + samples_per_read = int(0.1 * g_sample_rate) # 0.1 second = 100 ms + + print("Started! 
Please speak") + + line_num = 0 + speaker_id = 0 + buffer = [] + with sd.InputStream(channels=1, dtype="float32", samplerate=g_sample_rate) as s: + while True: + samples, _ = s.read(samples_per_read) # a blocking read + samples = samples.reshape(-1) + buffer = np.concatenate([buffer, samples]) + while len(buffer) > window_size: + vad.accept_waveform(buffer[:window_size]) + buffer = buffer[window_size:] + + while not vad.empty(): + if len(vad.front.samples) < 0.5 * g_sample_rate: + # this segment is too short, skip it + vad.pop() + continue + stream = extractor.create_stream() + stream.accept_waveform( + sample_rate=g_sample_rate, waveform=vad.front.samples + ) + vad.pop() + stream.input_finished() + + embedding = extractor.compute(stream) + embedding = np.array(embedding) + name = manager.search(embedding, threshold=args.threshold) + if not name: + # register it + new_name = f"speaker_{speaker_id}" + status = manager.add(new_name, embedding) + if not status: + raise RuntimeError(f"Failed to register speaker {new_name}") + print( + f"{line_num}: Detected new speaker. Register it as {new_name}" + ) + speaker_id += 1 + else: + print(f"{line_num}: Detected existing speaker: {name}") + line_num += 1 + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + print("\nCaught Ctrl + C. 
Exiting") From 09efe548084a5b216ebef0b725555a0cd4b76027 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 11 Jun 2024 15:22:48 +0800 Subject: [PATCH 008/237] add more text-to-speech models from piper (#988) --- README.md | 1 + scripts/apk/generate-asr-2pass-apk-script.py | 80 ++++++++++---------- scripts/apk/generate-asr-apk-script.py | 66 ++++++++-------- scripts/apk/generate-tts-apk-script.py | 9 +++ 4 files changed, 83 insertions(+), 73 deletions(-) diff --git a/README.md b/README.md index c80ade988..e2c4e913a 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ with the following APIs - C++, C, Python, Go, ``C#`` - Java, Kotlin, JavaScript - Swift + - Dart ## Links for pre-built Android APKs diff --git a/scripts/apk/generate-asr-2pass-apk-script.py b/scripts/apk/generate-asr-2pass-apk-script.py index 8b0948a96..85da8ada2 100755 --- a/scripts/apk/generate-asr-2pass-apk-script.py +++ b/scripts/apk/generate-asr-2pass-apk-script.py @@ -52,13 +52,13 @@ def get_2nd_models(): short_name="whisper_tiny", cmd=""" pushd $model_name - rm -v tiny.en-encoder.onnx - rm -v tiny.en-decoder.onnx + rm -fv tiny.en-encoder.onnx + rm -fv tiny.en-decoder.onnx rm -rf test_wavs - rm -v *.py - rm -v requirements.txt - rm -v .gitignore - rm -v README.md + rm -fv *.py + rm -fv requirements.txt + rm -fv .gitignore + rm -fv README.md ls -lh @@ -73,7 +73,7 @@ def get_2nd_models(): cmd=""" pushd $model_name - rm -v README.md + rm -fv README.md rm -rfv test_wavs rm model.onnx @@ -91,7 +91,7 @@ def get_2nd_models(): pushd $model_name rm -rfv test_wavs - rm -v README.md + rm -fv README.md mv -v data/lang_char/tokens.txt ./ rm -rfv data/lang_char @@ -119,15 +119,15 @@ def get_1st_models(): short_name="zipformer", cmd=""" pushd $model_name - rm -v decoder-epoch-99-avg-1.int8.onnx - rm -v encoder-epoch-99-avg-1.onnx - rm -v joiner-epoch-99-avg-1.onnx - - rm -v *.sh - rm -v bpe.model - rm -v README.md - rm -v .gitattributes - rm -v *state* + rm -fv decoder-epoch-99-avg-1.int8.onnx + rm -fv 
encoder-epoch-99-avg-1.onnx + rm -fv joiner-epoch-99-avg-1.onnx + + rm -fv *.sh + rm -fv bpe.model + rm -fv README.md + rm -fv .gitattributes + rm -fv *state* rm -rfv test_wavs ls -lh @@ -142,12 +142,12 @@ def get_1st_models(): short_name="zipformer2", cmd=""" pushd $model_name - rm -v encoder-epoch-99-avg-1-chunk-16-left-128.onnx - rm -v decoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx - rm -v joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx + rm -fv encoder-epoch-99-avg-1-chunk-16-left-128.onnx + rm -fv decoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx + rm -fv joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx - rm -v README.md - rm -v bpe.model + rm -fv README.md + rm -fv bpe.model rm -rfv test_wavs ls -lh @@ -162,14 +162,14 @@ def get_1st_models(): short_name="zipformer2", cmd=""" pushd $model_name - rm -v exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx - rm -v exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx - rm -v exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx + rm -fv exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx + rm -fv exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx + rm -fv exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx - rm -v data/lang_char/lexicon.txt - rm -v data/lang_char/words.txt + rm -fv data/lang_char/lexicon.txt + rm -fv data/lang_char/words.txt rm -rfv test_wavs - rm -v README.md + rm -fv README.md ls -lh exp/ ls -lh data/lang_char @@ -184,11 +184,11 @@ def get_1st_models(): short_name="zipformer", cmd=""" pushd $model_name - rm -v encoder-epoch-29-avg-9-with-averaged-model.onnx - rm -v decoder-epoch-29-avg-9-with-averaged-model.int8.onnx - rm -v joiner-epoch-29-avg-9-with-averaged-model.int8.onnx + rm -fv encoder-epoch-29-avg-9-with-averaged-model.onnx + rm -fv decoder-epoch-29-avg-9-with-averaged-model.int8.onnx + rm -fv joiner-epoch-29-avg-9-with-averaged-model.int8.onnx - rm -v *.sh + rm -fv *.sh rm -rf test_wavs rm README.md @@ -204,11 +204,11 @@ def get_1st_models(): short_name="small_zipformer", 
cmd=""" pushd $model_name - rm -v encoder-epoch-99-avg-1.onnx - rm -v decoder-epoch-99-avg-1.int8.onnx - rm -v joiner-epoch-99-avg-1.onnx + rm -fv encoder-epoch-99-avg-1.onnx + rm -fv decoder-epoch-99-avg-1.int8.onnx + rm -fv joiner-epoch-99-avg-1.onnx - rm -v *.sh + rm -fv *.sh rm -rf test_wavs rm README.md @@ -224,11 +224,11 @@ def get_1st_models(): short_name="small_zipformer", cmd=""" pushd $model_name - rm -v encoder-epoch-99-avg-1.onnx - rm -v decoder-epoch-99-avg-1.int8.onnx - rm -v joiner-epoch-99-avg-1.onnx + rm -fv encoder-epoch-99-avg-1.onnx + rm -fv decoder-epoch-99-avg-1.int8.onnx + rm -fv joiner-epoch-99-avg-1.onnx - rm -v *.sh + rm -fv *.sh rm -rf test_wavs rm README.md diff --git a/scripts/apk/generate-asr-apk-script.py b/scripts/apk/generate-asr-apk-script.py index 845819c5c..09a8915e6 100755 --- a/scripts/apk/generate-asr-apk-script.py +++ b/scripts/apk/generate-asr-apk-script.py @@ -52,15 +52,15 @@ def get_models(): short_name="zipformer", cmd=""" pushd $model_name - rm -v decoder-epoch-99-avg-1.int8.onnx - rm -v encoder-epoch-99-avg-1.onnx - rm -v joiner-epoch-99-avg-1.onnx - - rm -v *.sh - rm -v bpe.model - rm -v README.md - rm -v .gitattributes - rm -v *state* + rm -fv decoder-epoch-99-avg-1.int8.onnx + rm -fv encoder-epoch-99-avg-1.onnx + rm -fv joiner-epoch-99-avg-1.onnx + + rm -fv *.sh + rm -fv bpe.model + rm -fv README.md + rm -fv .gitattributes + rm -fv *state* rm -rfv test_wavs ls -lh @@ -75,12 +75,12 @@ def get_models(): short_name="zipformer2", cmd=""" pushd $model_name - rm -v encoder-epoch-99-avg-1-chunk-16-left-128.onnx - rm -v decoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx - rm -v joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx + rm -fv encoder-epoch-99-avg-1-chunk-16-left-128.onnx + rm -fv decoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx + rm -fv joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx - rm -v README.md - rm -v bpe.model + rm -fv README.md + rm -fv bpe.model rm -rfv test_wavs ls -lh @@ -95,14 +95,14 @@ def 
get_models(): short_name="zipformer2", cmd=""" pushd $model_name - rm -v exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx - rm -v exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx - rm -v exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx + rm -fv exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx + rm -fv exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx + rm -fv exp/joiner-epoch-12-avg-4-chunk-16-left-128.int8.onnx - rm -v data/lang_char/lexicon.txt - rm -v data/lang_char/words.txt + rm -fv data/lang_char/lexicon.txt + rm -fv data/lang_char/words.txt rm -rfv test_wavs - rm -v README.md + rm -fv README.md ls -lh exp/ ls -lh data/lang_char @@ -117,12 +117,12 @@ def get_models(): short_name="zipformer", cmd=""" pushd $model_name - rm -v encoder-epoch-29-avg-9-with-averaged-model.onnx - rm -v decoder-epoch-29-avg-9-with-averaged-model.int8.onnx - rm -v joiner-epoch-29-avg-9-with-averaged-model.int8.onnx + rm -fv encoder-epoch-29-avg-9-with-averaged-model.onnx + rm -fv decoder-epoch-29-avg-9-with-averaged-model.int8.onnx + rm -fv joiner-epoch-29-avg-9-with-averaged-model.int8.onnx - rm -v *.sh - rm -rf test_wavs + rm -fv *.sh + rm -rfv test_wavs rm README.md ls -lh @@ -137,11 +137,11 @@ def get_models(): short_name="small_zipformer", cmd=""" pushd $model_name - rm -v encoder-epoch-99-avg-1.onnx - rm -v decoder-epoch-99-avg-1.int8.onnx - rm -v joiner-epoch-99-avg-1.onnx + rm -fv encoder-epoch-99-avg-1.onnx + rm -fv decoder-epoch-99-avg-1.int8.onnx + rm -fv joiner-epoch-99-avg-1.onnx - rm -v *.sh + rm -fv *.sh rm -rf test_wavs rm README.md @@ -157,11 +157,11 @@ def get_models(): short_name="small_zipformer", cmd=""" pushd $model_name - rm -v encoder-epoch-99-avg-1.onnx - rm -v decoder-epoch-99-avg-1.int8.onnx - rm -v joiner-epoch-99-avg-1.onnx + rm -fv encoder-epoch-99-avg-1.onnx + rm -fv decoder-epoch-99-avg-1.int8.onnx + rm -fv joiner-epoch-99-avg-1.onnx - rm -v *.sh + rm -fv *.sh rm -rf test_wavs rm README.md diff --git 
a/scripts/apk/generate-tts-apk-script.py b/scripts/apk/generate-tts-apk-script.py index 9a3766874..6f1bb5bb8 100755 --- a/scripts/apk/generate-tts-apk-script.py +++ b/scripts/apk/generate-tts-apk-script.py @@ -103,6 +103,7 @@ def get_piper_models() -> List[TtsModel]: TtsModel(model_dir="vits-piper-ca_ES-upc_pau-x_low"), TtsModel(model_dir="vits-piper-ca_ES-upc_pau-x_low"), TtsModel(model_dir="vits-piper-cs_CZ-jirka-medium"), + TtsModel(model_dir="vits-piper-cy_GB-gwryw_gogleddol-medium"), TtsModel(model_dir="vits-piper-da_DK-talesyntese-medium"), TtsModel(model_dir="vits-piper-de_DE-eva_k-x_low"), TtsModel(model_dir="vits-piper-de_DE-karlsson-low"), @@ -126,15 +127,19 @@ def get_piper_models() -> List[TtsModel]: TtsModel(model_dir="vits-piper-en_GB-semaine-medium"), TtsModel(model_dir="vits-piper-en_GB-southern_english_female-low"), TtsModel(model_dir="vits-piper-en_GB-southern_english_female-medium"), + TtsModel(model_dir="vits-piper-en_GB-southern_english_male-medium"), TtsModel(model_dir="vits-piper-en_GB-sweetbbak-amy"), TtsModel(model_dir="vits-piper-en_GB-vctk-medium"), TtsModel(model_dir="vits-piper-en_US-amy-low"), TtsModel(model_dir="vits-piper-en_US-amy-medium"), TtsModel(model_dir="vits-piper-en_US-arctic-medium"), + TtsModel(model_dir="vits-piper-en_US-bryce-medium"), TtsModel(model_dir="vits-piper-en_US-danny-low"), TtsModel(model_dir="vits-piper-en_US-glados"), + TtsModel(model_dir="vits-piper-en_US-hfc_female-medium"), TtsModel(model_dir="vits-piper-en_US-hfc_male-medium"), TtsModel(model_dir="vits-piper-en_US-joe-medium"), + TtsModel(model_dir="vits-piper-en_US-john-medium"), TtsModel(model_dir="vits-piper-en_US-kathleen-low"), TtsModel(model_dir="vits-piper-en_US-kristin-medium"), TtsModel(model_dir="vits-piper-en_US-kusal-medium"), @@ -146,6 +151,7 @@ def get_piper_models() -> List[TtsModel]: TtsModel(model_dir="vits-piper-en_US-libritts_r-medium"), TtsModel(model_dir="vits-piper-en_US-ljspeech-high"), 
TtsModel(model_dir="vits-piper-en_US-ljspeech-medium"), + TtsModel(model_dir="vits-piper-en_US-norman-medium"), TtsModel(model_dir="vits-piper-en_US-ryan-high"), TtsModel(model_dir="vits-piper-en_US-ryan-low"), TtsModel(model_dir="vits-piper-en_US-ryan-medium"), @@ -162,6 +168,7 @@ def get_piper_models() -> List[TtsModel]: # TtsModel(model_dir="vits-piper-fr_FR-mls-medium"), TtsModel(model_dir="vits-piper-fr_FR-siwis-low"), TtsModel(model_dir="vits-piper-fr_FR-siwis-medium"), + TtsModel(model_dir="vits-piper-fr_FR-tom-medium"), TtsModel(model_dir="vits-piper-fr_FR-upmc-medium"), TtsModel(model_dir="vits-piper-hu_HU-anna-medium"), TtsModel(model_dir="vits-piper-hu_HU-berta-medium"), @@ -170,6 +177,7 @@ def get_piper_models() -> List[TtsModel]: TtsModel(model_dir="vits-piper-is_IS-salka-medium"), TtsModel(model_dir="vits-piper-is_IS-steinn-medium"), TtsModel(model_dir="vits-piper-is_IS-ugla-medium"), + TtsModel(model_dir="vits-piper-it_IT-paola-medium"), TtsModel(model_dir="vits-piper-it_IT-riccardo-x_low"), TtsModel(model_dir="vits-piper-ka_GE-natia-medium"), TtsModel(model_dir="vits-piper-kk_KZ-iseke-x_low"), @@ -204,6 +212,7 @@ def get_piper_models() -> List[TtsModel]: TtsModel(model_dir="vits-piper-sw_CD-lanfrica-medium"), TtsModel(model_dir="vits-piper-tr_TR-dfki-medium"), TtsModel(model_dir="vits-piper-tr_TR-fahrettin-medium"), + TtsModel(model_dir="vits-piper-tr_TR-fettah-medium"), TtsModel(model_dir="vits-piper-uk_UA-lada-x_low"), TtsModel(model_dir="vits-piper-uk_UA-ukrainian_tts-medium"), TtsModel(model_dir="vits-piper-vi_VN-25hours_single-low"), From aac86847adfdd5f2bb7975ae5c205289a5c508d4 Mon Sep 17 00:00:00 2001 From: gilcu3 <828241+gilcu3@users.noreply.github.com> Date: Tue, 11 Jun 2024 15:16:42 +0200 Subject: [PATCH 009/237] store speed in SharedPreferences (#991) --- .../sherpa/onnx/tts/engine/MainActivity.kt | 7 ++++++- .../onnx/tts/engine/PreferencesHelper.kt | 21 +++++++++++++++++++ .../k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt | 3 +++ 3 files 
changed, 30 insertions(+), 1 deletion(-) create mode 100644 android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/PreferencesHelper.kt diff --git a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt index 28ce449a0..f64b05f4c 100644 --- a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt +++ b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt @@ -2,6 +2,7 @@ package com.k2fsa.sherpa.onnx.tts.engine +import PreferenceHelper import android.media.MediaPlayer import android.net.Uri import android.os.Bundle @@ -47,6 +48,7 @@ class MainActivity : ComponentActivity() { override fun onCreate(savedInstanceState: Bundle?) { super.onCreate(savedInstanceState) TtsEngine.createTts(this) + val preferenceHelper = PreferenceHelper(this) setContent { SherpaOnnxTtsEngineTheme { // A surface container using the 'background' color from the theme @@ -63,7 +65,10 @@ class MainActivity : ComponentActivity() { Text("Speed " + String.format("%.1f", TtsEngine.speed)) Slider( value = TtsEngine.speedState.value, - onValueChange = { TtsEngine.speed = it }, + onValueChange = { + TtsEngine.speed = it + preferenceHelper.setSpeed(it) + }, valueRange = 0.2F..3.0F, modifier = Modifier.fillMaxWidth() ) diff --git a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/PreferencesHelper.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/PreferencesHelper.kt new file mode 100644 index 000000000..94c71f47d --- /dev/null +++ b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/PreferencesHelper.kt @@ -0,0 +1,21 @@ +import android.content.Context +import android.content.SharedPreferences + +class PreferenceHelper(context: Context) { + + private val PREFS_NAME = 
"com.k2fsa.sherpa.onnx.tts.engine" + private val SPEED_KEY = "speed" + + private val sharedPreferences: SharedPreferences = + context.getSharedPreferences(PREFS_NAME, Context.MODE_PRIVATE) + + fun setSpeed(value: Float) { + val editor = sharedPreferences.edit() + editor.putFloat(SPEED_KEY, value) + editor.apply() + } + + fun getSpeed(): Float { + return sharedPreferences.getFloat(SPEED_KEY, 1.0f) + } +} \ No newline at end of file diff --git a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt index 1bf92972e..49d794014 100644 --- a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt +++ b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt @@ -11,6 +11,7 @@ import com.k2fsa.sherpa.onnx.getOfflineTtsConfig import java.io.File import java.io.FileOutputStream import java.io.IOException +import PreferenceHelper object TtsEngine { var tts: OfflineTts? = null @@ -136,6 +137,8 @@ object TtsEngine { ruleFars = ruleFars ?: "" ) + speed = PreferenceHelper(context).getSpeed() + tts = OfflineTts(assetManager = assets, config = config) } From 208da78343838b754ec26dc117ddf6f0b8d1b3d7 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 12 Jun 2024 10:49:37 +0800 Subject: [PATCH 010/237] Limit the maximum segment length for VAD. 
(#990) --- sherpa-onnx/csrc/silero-vad-model.cc | 16 ++++++++++++++++ sherpa-onnx/csrc/silero-vad-model.h | 3 +++ sherpa-onnx/csrc/vad-model.h | 2 ++ sherpa-onnx/csrc/voice-activity-detector.cc | 12 ++++++++++++ 4 files changed, 33 insertions(+) diff --git a/sherpa-onnx/csrc/silero-vad-model.cc b/sherpa-onnx/csrc/silero-vad-model.cc index 1f8957d4e..a0c1e6c53 100644 --- a/sherpa-onnx/csrc/silero-vad-model.cc +++ b/sherpa-onnx/csrc/silero-vad-model.cc @@ -190,6 +190,14 @@ class SileroVadModel::Impl { int32_t MinSpeechDurationSamples() const { return min_speech_samples_; } + void SetMinSilenceDuration(float s) { + min_silence_samples_ = sample_rate_ * s; + } + + void SetThreshold(float threshold) { + config_.silero_vad.threshold = threshold; + } + private: void Init(void *model_data, size_t model_data_length) { sess_ = std::make_unique(env_, model_data, model_data_length, @@ -306,4 +314,12 @@ int32_t SileroVadModel::MinSpeechDurationSamples() const { return impl_->MinSpeechDurationSamples(); } +void SileroVadModel::SetMinSilenceDuration(float s) { + impl_->SetMinSilenceDuration(s); +} + +void SileroVadModel::SetThreshold(float threshold) { + impl_->SetThreshold(threshold); +} + } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/silero-vad-model.h b/sherpa-onnx/csrc/silero-vad-model.h index 9539890a6..d83e68321 100644 --- a/sherpa-onnx/csrc/silero-vad-model.h +++ b/sherpa-onnx/csrc/silero-vad-model.h @@ -42,6 +42,9 @@ class SileroVadModel : public VadModel { int32_t MinSilenceDurationSamples() const override; int32_t MinSpeechDurationSamples() const override; + void SetMinSilenceDuration(float s) override; + void SetThreshold(float threshold) override; + private: class Impl; std::unique_ptr impl_; diff --git a/sherpa-onnx/csrc/vad-model.h b/sherpa-onnx/csrc/vad-model.h index 8131b6af7..f3b2aab06 100644 --- a/sherpa-onnx/csrc/vad-model.h +++ b/sherpa-onnx/csrc/vad-model.h @@ -42,6 +42,8 @@ class VadModel { virtual int32_t MinSilenceDurationSamples() const = 0; 
virtual int32_t MinSpeechDurationSamples() const = 0; + virtual void SetMinSilenceDuration(float s) = 0; + virtual void SetThreshold(float threshold) = 0; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/voice-activity-detector.cc b/sherpa-onnx/csrc/voice-activity-detector.cc index 5f63acf1f..9b2b1b872 100644 --- a/sherpa-onnx/csrc/voice-activity-detector.cc +++ b/sherpa-onnx/csrc/voice-activity-detector.cc @@ -29,6 +29,14 @@ class VoiceActivityDetector::Impl { #endif void AcceptWaveform(const float *samples, int32_t n) { + if (buffer_.Size() > max_utterance_length_) { + model_->SetMinSilenceDuration(new_min_silence_duration_s_); + model_->SetThreshold(new_threshold_); + } else { + model_->SetMinSilenceDuration(config_.silero_vad.min_silence_duration); + model_->SetThreshold(config_.silero_vad.threshold); + } + int32_t window_size = model_->WindowSize(); // note n is usually window_size and there is no need to use @@ -114,6 +122,10 @@ class VoiceActivityDetector::Impl { CircularBuffer buffer_; std::vector last_; + int max_utterance_length_ = 16000 * 20; // in samples + float new_min_silence_duration_s_ = 0.1; + float new_threshold_ = 1.10; + int32_t start_ = -1; }; From 6c12590d21a3f8b8cfcd7e5c009940e445cbbcd9 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 12 Jun 2024 11:42:19 +0800 Subject: [PATCH 011/237] Fix CI errors. 
(#993) --- .github/workflows/flutter-linux.yaml | 7 +------ CMakeLists.txt | 4 +--- .../com/k2fsa/sherpa/onnx/tts/engine/GetSampleText.kt | 4 ++++ .../com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt | 1 + .../k2fsa/sherpa/onnx/tts/engine/PreferencesHelper.kt | 11 +++++++++++ .../com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt | 1 + build-swift-macos.sh | 1 - 7 files changed, 19 insertions(+), 10 deletions(-) diff --git a/.github/workflows/flutter-linux.yaml b/.github/workflows/flutter-linux.yaml index 80a31267e..a74f45007 100644 --- a/.github/workflows/flutter-linux.yaml +++ b/.github/workflows/flutter-linux.yaml @@ -89,18 +89,13 @@ jobs: flutter --version - mkdir -p /__t/flutter-Linux-3.22.1-X64/flutter - - git config --global --add safe.directory /__t/flutter-Linux-3.22.1-X64/flutter + git config --global --add safe.directory /__t/flutter-Linux-*/flutter || true flutter --version dart --version flutter doctor - - flutter doctor - - name: Install libgtk-3-dev shell: bash run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 584583ba1..7302a16b9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,11 +2,9 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "Minimum OS X deployment version. 
Used only for macOS") - - project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.9.27") +set(SHERPA_ONNX_VERSION "1.9.28") # Disable warning about # diff --git a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/GetSampleText.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/GetSampleText.kt index 61e683738..a01e0a7b6 100644 --- a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/GetSampleText.kt +++ b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/GetSampleText.kt @@ -25,6 +25,10 @@ fun getSampleText(lang: String): String { text = "Aquest és un motor de text a veu que utilitza Kaldi de nova generació" } + "cym" -> { + text = "Peiriant testun-i-lais yw hwn sy'n defnyddio Kaldi'r genhedlaeth nesaf" + } + "ces" -> { text = "Toto je převodník textu na řeč využívající novou generaci kaldi" } diff --git a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt index f64b05f4c..9a6bd47ab 100644 --- a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt +++ b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/MainActivity.kt @@ -93,6 +93,7 @@ class MainActivity : ComponentActivity() { TtsEngine.speakerId = 0 } } + preferenceHelper.setSid(TtsEngine.speakerId) }, label = { Text("Speaker ID: (0-${numSpeakers - 1})") diff --git a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/PreferencesHelper.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/PreferencesHelper.kt index 94c71f47d..57a6e324c 100644 --- a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/PreferencesHelper.kt +++ 
b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/PreferencesHelper.kt @@ -5,6 +5,7 @@ class PreferenceHelper(context: Context) { private val PREFS_NAME = "com.k2fsa.sherpa.onnx.tts.engine" private val SPEED_KEY = "speed" + private val SID_KEY = "speaker_id" private val sharedPreferences: SharedPreferences = context.getSharedPreferences(PREFS_NAME, Context.MODE_PRIVATE) @@ -18,4 +19,14 @@ class PreferenceHelper(context: Context) { fun getSpeed(): Float { return sharedPreferences.getFloat(SPEED_KEY, 1.0f) } + + fun setSid(value: Int) { + val editor = sharedPreferences.edit() + editor.putInt(SID_KEY, value) + editor.apply() + } + + fun getSid(): Int { + return sharedPreferences.getInt(SID_KEY, 0) + } } \ No newline at end of file diff --git a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt index 49d794014..480f8a384 100644 --- a/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt +++ b/android/SherpaOnnxTtsEngine/app/src/main/java/com/k2fsa/sherpa/onnx/tts/engine/TtsEngine.kt @@ -138,6 +138,7 @@ object TtsEngine { ) speed = PreferenceHelper(context).getSpeed() + speakerId = PreferenceHelper(context).getSid() tts = OfflineTts(assetManager = assets, config = config) } diff --git a/build-swift-macos.sh b/build-swift-macos.sh index cebfa295a..f41dd7d5c 100755 --- a/build-swift-macos.sh +++ b/build-swift-macos.sh @@ -7,7 +7,6 @@ mkdir -p $dir cd $dir cmake \ - -DCMAKE_OSX_ARCHITECTURES="x86_64" \ -DCMAKE_INSTALL_PREFIX=./install \ -DCMAKE_BUILD_TYPE=Release \ -DBUILD_SHARED_LIBS=OFF \ From 155f22d511d17a93626b76fab73ae2d92a2e180a Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Wed, 12 Jun 2024 16:47:44 +0900 Subject: [PATCH 012/237] Update features.h (#994) --- sherpa-onnx/csrc/features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sherpa-onnx/csrc/features.h b/sherpa-onnx/csrc/features.h index 2e4596a57..afbacd2ec 100644 --- a/sherpa-onnx/csrc/features.h +++ b/sherpa-onnx/csrc/features.h @@ -119,7 +119,7 @@ class FeatureExtractor { * @param frame_index The starting frame index * @param n Number of frames to get. * @return Return a 2-D tensor of shape (n, feature_dim). - * which is flattened into a 1-D vector (flattened in in row major) + * which is flattened into a 1-D vector (flattened in row major) */ std::vector GetFrames(int32_t frame_index, int32_t n) const; From c214d8fb741b76d84d450f2bddcfed4f9ad4b4ff Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 14 Jun 2024 10:22:56 +0800 Subject: [PATCH 013/237] fix kws for WebAssembly (#999) --- wasm/kws/app.js | 4 ++-- wasm/kws/sherpa-onnx-kws.js | 30 ++++++++++++++++++++++++--- wasm/kws/sherpa-onnx-wasm-main-kws.cc | 2 +- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/wasm/kws/app.js b/wasm/kws/app.js index e823f0494..1e97262a1 100644 --- a/wasm/kws/app.js +++ b/wasm/kws/app.js @@ -106,9 +106,9 @@ if (navigator.mediaDevices.getUserMedia) { let result = recognizer.getResult(recognizer_stream); - console.log(result) if (result.keyword.length > 0) { + console.log(result) lastResult = result; resultList.push(JSON.stringify(result)); } @@ -287,4 +287,4 @@ function downsampleBuffer(buffer, exportSampleRate) { offsetBuffer = nextOffsetBuffer; } return result; -}; \ No newline at end of file +}; diff --git a/wasm/kws/sherpa-onnx-kws.js b/wasm/kws/sherpa-onnx-kws.js index 22679dc5f..cb9d5ac7a 100644 --- a/wasm/kws/sherpa-onnx-kws.js +++ b/wasm/kws/sherpa-onnx-kws.js @@ -67,7 +67,7 @@ function initModelConfig(config, Module) { const paraformer_len = 2 * 4 const ctc_len = 1 * 4 - const len = transducer.len + paraformer_len + ctc_len + 5 * 4; + const len = transducer.len + paraformer_len + ctc_len + 7 * 4; const ptr = Module._malloc(len); let offset = 0; @@ -76,7 +76,10 @@ function initModelConfig(config, Module) { const tokensLen 
= Module.lengthBytesUTF8(config.tokens) + 1; const providerLen = Module.lengthBytesUTF8(config.provider) + 1; const modelTypeLen = Module.lengthBytesUTF8(config.modelType) + 1; - const bufferLen = tokensLen + providerLen + modelTypeLen; + const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1; + const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1; + const bufferLen = + tokensLen + providerLen + modelTypeLen + modelingUnitLen + bpeVocabLen; const buffer = Module._malloc(bufferLen); offset = 0; @@ -87,6 +90,14 @@ function initModelConfig(config, Module) { offset += providerLen; Module.stringToUTF8(config.modelType, buffer + offset, modelTypeLen); + offset += modelTypeLen; + + Module.stringToUTF8( + config.modelingUnit || '', buffer + offset, modelingUnitLen); + offset += modelingUnitLen; + + Module.stringToUTF8(config.bpeVocab || '', buffer + offset, bpeVocabLen); + offset += bpeVocabLen; offset = transducer.len + paraformer_len + ctc_len; Module.setValue(ptr + offset, buffer, 'i8*'); // tokens @@ -105,6 +116,17 @@ function initModelConfig(config, Module) { ptr + offset, buffer + tokensLen + providerLen, 'i8*'); // modelType offset += 4; + Module.setValue( + ptr + offset, buffer + tokensLen + providerLen + modelTypeLen, + 'i8*'); // modelingUnit + offset += 4; + + Module.setValue( + ptr + offset, + buffer + tokensLen + providerLen + modelTypeLen + modelingUnitLen, + 'i8*'); // bpeVocab + offset += 4; + return { buffer: buffer, ptr: ptr, len: len, transducer: transducer } @@ -248,7 +270,9 @@ function createKws(Module, myConfig) { provider: 'cpu', modelType: '', numThreads: 1, - debug: 1 + debug: 1, + modelingUnit: 'cjkchar', + bpeVocab: '', }; let featConfig = { diff --git a/wasm/kws/sherpa-onnx-wasm-main-kws.cc b/wasm/kws/sherpa-onnx-wasm-main-kws.cc index 832e525d9..cbb3ab37d 100644 --- a/wasm/kws/sherpa-onnx-wasm-main-kws.cc +++ b/wasm/kws/sherpa-onnx-wasm-main-kws.cc @@ -19,7 +19,7 @@ 
static_assert(sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) == 1 * 4, ""); static_assert(sizeof(SherpaOnnxOnlineModelConfig) == sizeof(SherpaOnnxOnlineTransducerModelConfig) + sizeof(SherpaOnnxOnlineParaformerModelConfig) + - sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 5 * 4, + sizeof(SherpaOnnxOnlineZipformer2CtcModelConfig) + 7 * 4, ""); static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxKeywordSpotterConfig) == From d08cc04567dd842c896a03d1d9be46f1f2eeb99b Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 14 Jun 2024 10:37:16 +0800 Subject: [PATCH 014/237] Add VAD example for Dart API (#996) --- .../workflows/build-wheels-macos-arm64.yaml | 12 +- .../build-wheels-macos-universal2.yaml | 2 +- .github/workflows/build-wheels-macos-x64.yaml | 12 +- .github/workflows/flutter-macos.yaml | 48 ++- .github/workflows/flutter-windows-x64.yaml | 46 +- .github/workflows/test-dart.yaml | 60 +++ CMakeLists.txt | 5 +- cmake/cmake_extension.py | 8 +- cmake/kaldi-decoder.cmake | 16 +- cmake/kaldifst.cmake | 17 +- cmake/openfst.cmake | 20 +- dart-api-examples/README.md | 18 + dart-api-examples/vad/.gitignore | 3 + dart-api-examples/vad/CHANGELOG.md | 3 + dart-api-examples/vad/README.md | 21 + dart-api-examples/vad/analysis_options.yaml | 30 ++ dart-api-examples/vad/bin/vad.dart | 93 ++++ dart-api-examples/vad/pubspec.lock | 402 ++++++++++++++++++ dart-api-examples/vad/pubspec.yaml | 17 + dart-api-examples/vad/run.sh | 22 + scripts/dotnet/generate.py | 6 +- scripts/dotnet/run.sh | 2 +- sherpa-onnx/flutter/CHANGELOG.md | 11 + sherpa-onnx/flutter/lib/sherpa_onnx.dart | 24 +- .../flutter/lib/src/offline_recognizer.dart | 3 +- .../flutter/lib/src/offline_stream.dart | 2 +- .../flutter/lib/src/online_recognizer.dart | 3 +- .../flutter/lib/src/online_stream.dart | 3 +- .../flutter/lib/src/sherpa_onnx_bindings.dart | 7 +- sherpa-onnx/flutter/lib/src/vad.dart | 20 +- sherpa-onnx/flutter/notes.md | 17 + 
sherpa-onnx/flutter/pubspec.yaml | 66 +-- sherpa-onnx/flutter/windows/CMakeLists.txt | 6 +- 33 files changed, 883 insertions(+), 142 deletions(-) create mode 100644 .github/workflows/test-dart.yaml create mode 100644 dart-api-examples/README.md create mode 100644 dart-api-examples/vad/.gitignore create mode 100644 dart-api-examples/vad/CHANGELOG.md create mode 100644 dart-api-examples/vad/README.md create mode 100644 dart-api-examples/vad/analysis_options.yaml create mode 100644 dart-api-examples/vad/bin/vad.dart create mode 100644 dart-api-examples/vad/pubspec.lock create mode 100644 dart-api-examples/vad/pubspec.yaml create mode 100755 dart-api-examples/vad/run.sh create mode 100644 sherpa-onnx/flutter/CHANGELOG.md diff --git a/.github/workflows/build-wheels-macos-arm64.yaml b/.github/workflows/build-wheels-macos-arm64.yaml index 9a8edd504..c31d92bf4 100644 --- a/.github/workflows/build-wheels-macos-arm64.yaml +++ b/.github/workflows/build-wheels-macos-arm64.yaml @@ -48,7 +48,7 @@ jobs: path: ./wheelhouse/*.whl - name: Publish to huggingface - if: matrix.python-version == 'cp38' + if: matrix.python-version == 'cp39' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} uses: nick-fields/retry@v3 @@ -82,7 +82,13 @@ jobs: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | - python3 -m pip install --break-system-packages --upgrade pip - python3 -m pip install --break-system-packages wheel twine setuptools + opts='--break-system-packages' + v=${{ matrix.python-version }} + if [[ $v == cp38 || $v == cp39 ]]; then + opts='' + fi + + python3 -m pip install $opts --upgrade pip + python3 -m pip install $opts wheel twine setuptools twine upload ./wheelhouse/*.whl diff --git a/.github/workflows/build-wheels-macos-universal2.yaml b/.github/workflows/build-wheels-macos-universal2.yaml index 4d52110ee..d08a93075 100644 --- a/.github/workflows/build-wheels-macos-universal2.yaml +++ b/.github/workflows/build-wheels-macos-universal2.yaml @@ -50,7 
+50,7 @@ jobs: path: ./wheelhouse/*.whl - name: Publish to huggingface - if: matrix.python-version == 'cp38' + if: matrix.python-version == 'cp39' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} uses: nick-fields/retry@v3 diff --git a/.github/workflows/build-wheels-macos-x64.yaml b/.github/workflows/build-wheels-macos-x64.yaml index fbd7781b5..250ef76c7 100644 --- a/.github/workflows/build-wheels-macos-x64.yaml +++ b/.github/workflows/build-wheels-macos-x64.yaml @@ -65,7 +65,7 @@ jobs: path: ./wheelhouse/*.whl - name: Publish to huggingface - if: matrix.python-version == 'cp38' + if: matrix.python-version == 'cp39' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} uses: nick-fields/retry@v3 @@ -99,7 +99,13 @@ jobs: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | - python3 -m pip install --break-system-packages --upgrade pip - python3 -m pip install --break-system-packages wheel twine setuptools + opts='--break-system-packages' + v=${{ matrix.python-version }} + if [[ $v == cp38 || $v == cp39 ]]; then + opts='' + fi + + python3 -m pip install $opts --upgrade pip + python3 -m pip install $opts wheel twine setuptools twine upload ./wheelhouse/*.whl diff --git a/.github/workflows/flutter-macos.yaml b/.github/workflows/flutter-macos.yaml index 3ffa700f7..25a53d89a 100644 --- a/.github/workflows/flutter-macos.yaml +++ b/.github/workflows/flutter-macos.yaml @@ -152,6 +152,8 @@ jobs: - name: Build flutter shell: bash run: | + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + d=$PWD pushd sherpa-onnx/flutter @@ -166,7 +168,7 @@ jobs: tree ./sherpa_onnx.app - app=flutter_sherpa_onnx_macos_${{ matrix.arch }}.app + app=sherpa-onnx-osx-${{ matrix.arch }}-$SHERPA_ONNX_VERSION.app mv sherpa_onnx.app $app tar cjfv $app.tar.bz2 $app ls -lh @@ -178,13 +180,43 @@ jobs: - uses: actions/upload-artifact@v4 with: - name: flutter-sherpa-onnx-app-macos-${{ matrix.arch }} + name: sherpa-onnx-osx-${{ 
matrix.arch }} path: ./*.tar.bz2 - - name: Release android libs - if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') - uses: svenstaro/upload-release-action@v2 + - name: Publish to huggingface + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 with: - file_glob: true - overwrite: true - file: flutter*.tar.bz2 + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + + git clone https://huggingface.co/csukuangfj/sherpa-onnx-flutter huggingface + cd huggingface + git fetch + git pull + git merge -m "merge remote" --ff origin main + mkdir -p flutter + cp -v ../*.tar.bz2 ./flutter + + git status + git lfs track "*.bz2" + git add . + git commit -m "add more files" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-flutter main + + # - name: Release android libs + # if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') + # uses: svenstaro/upload-release-action@v2 + # with: + # file_glob: true + # overwrite: true + # file: sherpa*.tar.bz2 diff --git a/.github/workflows/flutter-windows-x64.yaml b/.github/workflows/flutter-windows-x64.yaml index f09187ca7..cfe54b053 100644 --- a/.github/workflows/flutter-windows-x64.yaml +++ b/.github/workflows/flutter-windows-x64.yaml @@ -142,7 +142,7 @@ jobs: cd build/windows/x64/runner/ - dst=flutter_sherpa_onnx_windows_x64 + dst=sherpa-onnx-win-x64-$SHERPA_ONNX_VERSION mv Release $dst tar cjfv $dst.tar.bz2 ./$dst @@ -155,13 +155,43 @@ jobs: - uses: actions/upload-artifact@v4 with: - name: flutter-sherpa-onnx-windows-x64 + name: sherpa-onnx-win-x64 path: ./*.tar.bz2 - - name: 
Release android libs - if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') - uses: svenstaro/upload-release-action@v2 + - name: Publish to huggingface + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 with: - file_glob: true - overwrite: true - file: flutter*.tar.bz2 + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + + git clone https://huggingface.co/csukuangfj/sherpa-onnx-flutter huggingface + cd huggingface + git fetch + git pull + git merge -m "merge remote" --ff origin main + mkdir -p flutter + cp -v ../*.tar.bz2 ./flutter + + git status + git lfs track "*.bz2" + git add . + git commit -m "add more files" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-flutter main + + # - name: Release android libs + # if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') + # uses: svenstaro/upload-release-action@v2 + # with: + # file_glob: true + # overwrite: true + # file: sherpa*.tar.bz2 diff --git a/.github/workflows/test-dart.yaml b/.github/workflows/test-dart.yaml new file mode 100644 index 000000000..0734df705 --- /dev/null +++ b/.github/workflows/test-dart.yaml @@ -0,0 +1,60 @@ +name: test-dart + +on: + push: + branches: + - master + paths: + - '.github/workflows/test-dart.yaml' + - 'dart-api-examples/**' + pull_request: + branches: + - master + paths: + - '.github/workflows/test-dart.yaml' + - 'dart-api-examples/**' + + workflow_dispatch: + +concurrency: + group: test-dart-${{ github.ref }} + cancel-in-progress: true + +jobs: + dart: + name: ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + 
fail-fast: false + matrix: + os: [macos-latest, ubuntu-latest] #, windows-latest] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Flutter SDK + uses: flutter-actions/setup-flutter@v3 + with: + channel: stable + version: latest + + - name: Display flutter info + shell: bash + run: | + which flutter + which dart + + flutter --version + dart --version + flutter doctor + + - name: Run tests + shell: bash + run: | + cd dart-api-examples + + pushd vad + ./run.sh + popd diff --git a/CMakeLists.txt b/CMakeLists.txt index 7302a16b9..4c1f66416 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,10 @@ set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "Minimum OS X deployment ve project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.9.28") +# Remember to update +# ./nodejs-addon-examples +# ./dart-api-examples/ +set(SHERPA_ONNX_VERSION "1.9.29") # Disable warning about # diff --git a/cmake/cmake_extension.py b/cmake/cmake_extension.py index 46116f5c5..68be0f0c4 100644 --- a/cmake/cmake_extension.py +++ b/cmake/cmake_extension.py @@ -84,9 +84,9 @@ def get_binaries(): "piper_phonemize.dll", "sherpa-onnx-c-api.dll", "sherpa-onnx-core.dll", - "sherpa-onnx-fstfar.lib", - "sherpa-onnx-fst.lib", - "sherpa-onnx-kaldifst-core.lib", + "sherpa-onnx-fstfar.dll", + "sherpa-onnx-fst.dll", + "sherpa-onnx-kaldifst-core.dll", "sherpa-onnx-portaudio.dll", "ucd.dll", ] @@ -211,7 +211,7 @@ def build_extension(self, ext: setuptools.extension.Extension): binaries = get_binaries() for f in binaries: - suffix = "" if (".dll" in f or ".lib" in f) else suffix + suffix = "" if ".dll" in f else suffix src_file = install_dir / "bin" / (f + suffix) if not src_file.is_file(): src_file = install_dir / "lib" / (f + suffix) diff --git a/cmake/kaldi-decoder.cmake b/cmake/kaldi-decoder.cmake index b78ece58f..aa937b3e4 100644 --- a/cmake/kaldi-decoder.cmake +++ b/cmake/kaldi-decoder.cmake @@ -1,9 +1,9 @@ function(download_kaldi_decoder) include(FetchContent) - 
set(kaldi_decoder_URL "https://github.com/k2-fsa/kaldi-decoder/archive/refs/tags/v0.2.5.tar.gz") - set(kaldi_decoder_URL2 "https://hub.nuaa.cf/k2-fsa/kaldi-decoder/archive/refs/tags/v0.2.5.tar.gz") - set(kaldi_decoder_HASH "SHA256=f663e58aef31b33cd8086eaa09ff1383628039845f31300b5abef817d8cc2fff") + set(kaldi_decoder_URL "https://github.com/k2-fsa/kaldi-decoder/archive/refs/tags/v0.2.6.tar.gz") + set(kaldi_decoder_URL2 "https://hub.nuaa.cf/k2-fsa/kaldi-decoder/archive/refs/tags/v0.2.6.tar.gz") + set(kaldi_decoder_HASH "SHA256=b13c78b37495cafc6ef3f8a7b661b349c55a51abbd7f7f42f389408dcf86a463") set(KALDI_DECODER_BUILD_PYTHON OFF CACHE BOOL "" FORCE) set(KALDI_DECODER_ENABLE_TESTS OFF CACHE BOOL "" FORCE) @@ -12,11 +12,11 @@ function(download_kaldi_decoder) # If you don't have access to the Internet, # please pre-download kaldi-decoder set(possible_file_locations - $ENV{HOME}/Downloads/kaldi-decoder-0.2.5.tar.gz - ${CMAKE_SOURCE_DIR}/kaldi-decoder-0.2.5.tar.gz - ${CMAKE_BINARY_DIR}/kaldi-decoder-0.2.5.tar.gz - /tmp/kaldi-decoder-0.2.5.tar.gz - /star-fj/fangjun/download/github/kaldi-decoder-0.2.5.tar.gz + $ENV{HOME}/Downloads/kaldi-decoder-0.2.6.tar.gz + ${CMAKE_SOURCE_DIR}/kaldi-decoder-0.2.6.tar.gz + ${CMAKE_BINARY_DIR}/kaldi-decoder-0.2.6.tar.gz + /tmp/kaldi-decoder-0.2.6.tar.gz + /star-fj/fangjun/download/github/kaldi-decoder-0.2.6.tar.gz ) foreach(f IN LISTS possible_file_locations) diff --git a/cmake/kaldifst.cmake b/cmake/kaldifst.cmake index 3b5ce3ba2..5e2130dbe 100644 --- a/cmake/kaldifst.cmake +++ b/cmake/kaldifst.cmake @@ -1,18 +1,18 @@ function(download_kaldifst) include(FetchContent) - set(kaldifst_URL "https://github.com/k2-fsa/kaldifst/archive/refs/tags/v1.7.10.tar.gz") - set(kaldifst_URL2 "https://hub.nuaa.cf/k2-fsa/kaldifst/archive/refs/tags/v1.7.10.tar.gz") - set(kaldifst_HASH "SHA256=7f7b3173a6584a6b1987f65ae7af2ac453d66b845f875a9d31074b8d2cd0de54") + set(kaldifst_URL "https://github.com/k2-fsa/kaldifst/archive/refs/tags/v1.7.11.tar.gz") + 
set(kaldifst_URL2 "https://hub.nuaa.cf/k2-fsa/kaldifst/archive/refs/tags/v1.7.11.tar.gz") + set(kaldifst_HASH "SHA256=b43b3332faa2961edc730e47995a58cd4e22ead21905d55b0c4a41375b4a525f") # If you don't have access to the Internet, # please pre-download kaldifst set(possible_file_locations - $ENV{HOME}/Downloads/kaldifst-1.7.10.tar.gz - ${CMAKE_SOURCE_DIR}/kaldifst-1.7.10.tar.gz - ${CMAKE_BINARY_DIR}/kaldifst-1.7.10.tar.gz - /tmp/kaldifst-1.7.10.tar.gz - /star-fj/fangjun/download/github/kaldifst-1.7.10.tar.gz + $ENV{HOME}/Downloads/kaldifst-1.7.11.tar.gz + ${CMAKE_SOURCE_DIR}/kaldifst-1.7.11.tar.gz + ${CMAKE_BINARY_DIR}/kaldifst-1.7.11.tar.gz + /tmp/kaldifst-1.7.11.tar.gz + /star-fj/fangjun/download/github/kaldifst-1.7.11.tar.gz ) foreach(f IN LISTS possible_file_locations) @@ -51,6 +51,7 @@ function(download_kaldifst) ) set_target_properties(kaldifst_core PROPERTIES OUTPUT_NAME "sherpa-onnx-kaldifst-core") + # installed in ./kaldi-decoder.cmake endfunction() download_kaldifst() diff --git a/cmake/openfst.cmake b/cmake/openfst.cmake index c964c14a4..59d4f9fc3 100644 --- a/cmake/openfst.cmake +++ b/cmake/openfst.cmake @@ -3,18 +3,18 @@ function(download_openfst) include(FetchContent) - set(openfst_URL "https://github.com/csukuangfj/openfst/archive/refs/tags/sherpa-onnx-2024-05-22-2.tar.gz") - set(openfst_URL2 "https://hub.nuaa.cf/csukuangfj/openfst/archive/refs/tags/sherpa-onnx-2024-05-22-2.tar.gz") - set(openfst_HASH "SHA256=ec52d32ab46ac884d77c87918155ca9d0cae424095ce3bd7e3cc7eaab8235a39") + set(openfst_URL "https://github.com/csukuangfj/openfst/archive/refs/tags/sherpa-onnx-2024-06-13.tar.gz") + set(openfst_URL2 "https://hub.nuaa.cf/csukuangfj/openfst/archive/refs/tags/sherpa-onnx-2024-06-13.tar.gz") + set(openfst_HASH "SHA256=f10a71c6b64d89eabdc316d372b956c30c825c7c298e2f20c780320e8181ffb6") # If you don't have access to the Internet, # please pre-download it set(possible_file_locations - $ENV{HOME}/Downloads/openfst-sherpa-onnx-2024-05-22-2.tar.gz - 
${CMAKE_SOURCE_DIR}/openfst-sherpa-onnx-2024-05-22-2.tar.gz - ${CMAKE_BINARY_DIR}/openfst-sherpa-onnx-2024-05-22-2.tar.gz - /tmp/openfst-sherpa-onnx-2024-05-22-2.tar.gz - /star-fj/fangjun/download/github/openfst-sherpa-onnx-2024-05-22-2.tar.gz + $ENV{HOME}/Downloads/openfst-sherpa-onnx-2024-06-13.tar.gz + ${CMAKE_SOURCE_DIR}/openfst-sherpa-onnx-2024-06-13.tar.gz + ${CMAKE_BINARY_DIR}/openfst-sherpa-onnx-2024-06-13.tar.gz + /tmp/openfst-sherpa-onnx-2024-06-13.tar.gz + /star-fj/fangjun/download/github/openfst-sherpa-onnx-2024-06-13.tar.gz ) foreach(f IN LISTS possible_file_locations) @@ -27,7 +27,7 @@ function(download_openfst) endforeach() set(HAVE_BIN OFF CACHE BOOL "" FORCE) - set(HAVE_SCRIPT ON CACHE BOOL "" FORCE) + set(HAVE_SCRIPT OFF CACHE BOOL "" FORCE) set(HAVE_COMPACT OFF CACHE BOOL "" FORCE) set(HAVE_COMPRESS OFF CACHE BOOL "" FORCE) set(HAVE_CONST OFF CACHE BOOL "" FORCE) @@ -70,8 +70,6 @@ function(download_openfst) add_subdirectory(${openfst_SOURCE_DIR} ${openfst_BINARY_DIR} EXCLUDE_FROM_ALL) set(openfst_SOURCE_DIR ${openfst_SOURCE_DIR} PARENT_SCOPE) - # Rename libfst.so.6 to libsherpa-onnx-fst.so.6 to avoid potential conflicts - # when sherpa-onnx is installed. set_target_properties(fst PROPERTIES OUTPUT_NAME "sherpa-onnx-fst") set_target_properties(fstfar PROPERTIES OUTPUT_NAME "sherpa-onnx-fstfar") diff --git a/dart-api-examples/README.md b/dart-api-examples/README.md new file mode 100644 index 000000000..930037160 --- /dev/null +++ b/dart-api-examples/README.md @@ -0,0 +1,18 @@ +# Introduction + +This directory contains examples for Dart API. 
+ +You can find the package at +https://pub.dev/packages/sherpa_onnx + +## How to create an example in this folder + +```bash +dart create vad +cd vad + +# Edit pubspec.yaml and add sherpa_onnx to dependencies + +dart pub get +dart run +``` diff --git a/dart-api-examples/vad/.gitignore b/dart-api-examples/vad/.gitignore new file mode 100644 index 000000000..3a8579040 --- /dev/null +++ b/dart-api-examples/vad/.gitignore @@ -0,0 +1,3 @@ +# https://dart.dev/guides/libraries/private-files +# Created by `dart pub` +.dart_tool/ diff --git a/dart-api-examples/vad/CHANGELOG.md b/dart-api-examples/vad/CHANGELOG.md new file mode 100644 index 000000000..effe43c82 --- /dev/null +++ b/dart-api-examples/vad/CHANGELOG.md @@ -0,0 +1,3 @@ +## 1.0.0 + +- Initial version. diff --git a/dart-api-examples/vad/README.md b/dart-api-examples/vad/README.md new file mode 100644 index 000000000..1dd7f68c1 --- /dev/null +++ b/dart-api-examples/vad/README.md @@ -0,0 +1,21 @@ +# Introduction + +This example shows how to use the Dart API from sherpa-onnx for voice activity detection (VAD). +Specifically, we use VAD to remove silences from a wave file. + +# Usage + +```bash +dart pub get + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav + +dart run \ + ./bin/vad.dart \ + --silero-vad ./silero_vad.onnx \ + --input-wav ./lei-jun-test.wav \ + --output-wav ./lei-jun-test-no-silence.wav +``` + +It should generate a file `lei-jun-test-no-silence.wav`, where silences are removed. diff --git a/dart-api-examples/vad/analysis_options.yaml b/dart-api-examples/vad/analysis_options.yaml new file mode 100644 index 000000000..dee8927aa --- /dev/null +++ b/dart-api-examples/vad/analysis_options.yaml @@ -0,0 +1,30 @@ +# This file configures the static analysis results for your project (errors, +# warnings, and lints). 
+# +# This enables the 'recommended' set of lints from `package:lints`. +# This set helps identify many issues that may lead to problems when running +# or consuming Dart code, and enforces writing Dart using a single, idiomatic +# style and format. +# +# If you want a smaller set of lints you can change this to specify +# 'package:lints/core.yaml'. These are just the most critical lints +# (the recommended set includes the core lints). +# The core lints are also what is used by pub.dev for scoring packages. + +include: package:lints/recommended.yaml + +# Uncomment the following section to specify additional rules. + +# linter: +# rules: +# - camel_case_types + +# analyzer: +# exclude: +# - path/to/excluded/files/** + +# For more information about the core and recommended set of lints, see +# https://dart.dev/go/core-lints + +# For additional information about configuring this file, see +# https://dart.dev/guides/language/analysis-options diff --git a/dart-api-examples/vad/bin/vad.dart b/dart-api-examples/vad/bin/vad.dart new file mode 100644 index 000000000..d981bad94 --- /dev/null +++ b/dart-api-examples/vad/bin/vad.dart @@ -0,0 +1,93 @@ +import 'dart:io'; +import 'dart:isolate'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:path/path.dart' as p; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +Future initSherpaOnnx() async { + var uri = await Isolate.resolvePackageUri( + Uri.parse('package:sherpa_onnx/sherpa_onnx.dart')); + + if (uri == null) { + print('File not found'); + exit(1); + } + String platform = ''; + if (Platform.isMacOS) { + platform = 'macos'; + } else if (Platform.isLinux) { + platform = 'linux'; + } else if (Platform.isWindows) { + platform = 'windows'; + } else { + throw UnsupportedError('Unknown platform: ${Platform.operatingSystem}'); + } + + final libPath = p.join(p.dirname(p.fromUri(uri)), '..', platform); + sherpa_onnx.initBindings(libPath); +} + +void main(List arguments) async { + await 
initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('silero-vad', help: 'Path to silero_vad.onnx') + ..addOption('input-wav', help: 'Path to input.wav') + ..addOption('output-wav', help: 'Path to output.wav'); + final res = parser.parse(arguments); + if (res['silero-vad'] == null || + res['input-wav'] == null || + res['output-wav'] == null) { + print(parser.usage); + exit(1); + } + + final sileroVad = res['silero-vad'] as String; + final inputWav = res['input-wav'] as String; + final outputWav = res['output-wav'] as String; + + final sileroVadConfig = sherpa_onnx.SileroVadModelConfig( + model: sileroVad, + minSilenceDuration: 0.25, + minSpeechDuration: 0.5, + ); + final config = sherpa_onnx.VadModelConfig( + sileroVad: sileroVadConfig, + numThreads: 1, + debug: true, + ); + + final vad = sherpa_onnx.VoiceActivityDetector( + config: config, bufferSizeInSeconds: 10); + + final waveData = sherpa_onnx.readWave(inputWav); + if (waveData.sampleRate != 16000) { + print('Only 16000 Hz is supported. 
Given: ${waveData.sampleRate}'); + exit(1); + } + + int numSamples = waveData.samples.length; + int numIter = numSamples ~/ config.sileroVad.windowSize; + + List> allSamples = []; + + for (int i = 0; i != numIter; ++i) { + int start = i * config.sileroVad.windowSize; + vad.acceptWaveform(Float32List.sublistView( + waveData.samples, start, start + config.sileroVad.windowSize)); + + if (vad.isDetected()) { + while (!vad.isEmpty()) { + allSamples.add(vad.front().samples); + vad.pop(); + } + } + } + + final s = Float32List.fromList(allSamples.expand((x) => x).toList()); + sherpa_onnx.writeWave( + filename: outputWav, samples: s, sampleRate: waveData.sampleRate); + print('Saved to ${outputWav}'); +} diff --git a/dart-api-examples/vad/pubspec.lock b/dart-api-examples/vad/pubspec.lock new file mode 100644 index 000000000..776660ec5 --- /dev/null +++ b/dart-api-examples/vad/pubspec.lock @@ -0,0 +1,402 @@ +# Generated by pub +# See https://dart.dev/tools/pub/glossary#lockfile +packages: + _fe_analyzer_shared: + dependency: transitive + description: + name: _fe_analyzer_shared + sha256: "5aaf60d96c4cd00fe7f21594b5ad6a1b699c80a27420f8a837f4d68473ef09e3" + url: "https://pub.dev" + source: hosted + version: "68.0.0" + _macros: + dependency: transitive + description: dart + source: sdk + version: "0.1.0" + analyzer: + dependency: transitive + description: + name: analyzer + sha256: "21f1d3720fd1c70316399d5e2bccaebb415c434592d778cce8acb967b8578808" + url: "https://pub.dev" + source: hosted + version: "6.5.0" + args: + dependency: transitive + description: + name: args + sha256: "7cf60b9f0cc88203c5a190b4cd62a99feea42759a7fa695010eb5de1c0b2252a" + url: "https://pub.dev" + source: hosted + version: "2.5.0" + async: + dependency: transitive + description: + name: async + sha256: "947bfcf187f74dbc5e146c9eb9c0f10c9f8b30743e341481c1e2ed3ecc18c20c" + url: "https://pub.dev" + source: hosted + version: "2.11.0" + boolean_selector: + dependency: transitive + description: + name: 
boolean_selector + sha256: "6cfb5af12253eaf2b368f07bacc5a80d1301a071c73360d746b7f2e32d762c66" + url: "https://pub.dev" + source: hosted + version: "2.1.1" + collection: + dependency: transitive + description: + name: collection + sha256: ee67cb0715911d28db6bf4af1026078bd6f0128b07a5f66fb2ed94ec6783c09a + url: "https://pub.dev" + source: hosted + version: "1.18.0" + convert: + dependency: transitive + description: + name: convert + sha256: "0f08b14755d163f6e2134cb58222dd25ea2a2ee8a195e53983d57c075324d592" + url: "https://pub.dev" + source: hosted + version: "3.1.1" + coverage: + dependency: transitive + description: + name: coverage + sha256: "3945034e86ea203af7a056d98e98e42a5518fff200d6e8e6647e1886b07e936e" + url: "https://pub.dev" + source: hosted + version: "1.8.0" + crypto: + dependency: transitive + description: + name: crypto + sha256: ff625774173754681d66daaf4a448684fb04b78f902da9cb3d308c19cc5e8bab + url: "https://pub.dev" + source: hosted + version: "3.0.3" + file: + dependency: transitive + description: + name: file + sha256: "5fc22d7c25582e38ad9a8515372cd9a93834027aacf1801cf01164dac0ffa08c" + url: "https://pub.dev" + source: hosted + version: "7.0.0" + frontend_server_client: + dependency: transitive + description: + name: frontend_server_client + sha256: f64a0333a82f30b0cca061bc3d143813a486dc086b574bfb233b7c1372427694 + url: "https://pub.dev" + source: hosted + version: "4.0.0" + glob: + dependency: transitive + description: + name: glob + sha256: "0e7014b3b7d4dac1ca4d6114f82bf1782ee86745b9b42a92c9289c23d8a0ab63" + url: "https://pub.dev" + source: hosted + version: "2.1.2" + http_multi_server: + dependency: transitive + description: + name: http_multi_server + sha256: "97486f20f9c2f7be8f514851703d0119c3596d14ea63227af6f7a481ef2b2f8b" + url: "https://pub.dev" + source: hosted + version: "3.2.1" + http_parser: + dependency: transitive + description: + name: http_parser + sha256: "2aa08ce0341cc9b354a498388e30986515406668dbcc4f7c950c3e715496693b" + url: 
"https://pub.dev" + source: hosted + version: "4.0.2" + io: + dependency: transitive + description: + name: io + sha256: "2ec25704aba361659e10e3e5f5d672068d332fc8ac516421d483a11e5cbd061e" + url: "https://pub.dev" + source: hosted + version: "1.0.4" + js: + dependency: transitive + description: + name: js + sha256: c1b2e9b5ea78c45e1a0788d29606ba27dc5f71f019f32ca5140f61ef071838cf + url: "https://pub.dev" + source: hosted + version: "0.7.1" + lints: + dependency: "direct dev" + description: + name: lints + sha256: cbf8d4b858bb0134ef3ef87841abdf8d63bfc255c266b7bf6b39daa1085c4290 + url: "https://pub.dev" + source: hosted + version: "3.0.0" + logging: + dependency: transitive + description: + name: logging + sha256: "623a88c9594aa774443aa3eb2d41807a48486b5613e67599fb4c41c0ad47c340" + url: "https://pub.dev" + source: hosted + version: "1.2.0" + macros: + dependency: transitive + description: + name: macros + sha256: "12e8a9842b5a7390de7a781ec63d793527582398d16ea26c60fed58833c9ae79" + url: "https://pub.dev" + source: hosted + version: "0.1.0-main.0" + matcher: + dependency: transitive + description: + name: matcher + sha256: d2323aa2060500f906aa31a895b4030b6da3ebdcc5619d14ce1aada65cd161cb + url: "https://pub.dev" + source: hosted + version: "0.12.16+1" + meta: + dependency: transitive + description: + name: meta + sha256: bdb68674043280c3428e9ec998512fb681678676b3c54e773629ffe74419f8c7 + url: "https://pub.dev" + source: hosted + version: "1.15.0" + mime: + dependency: transitive + description: + name: mime + sha256: "2e123074287cc9fd6c09de8336dae606d1ddb88d9ac47358826db698c176a1f2" + url: "https://pub.dev" + source: hosted + version: "1.0.5" + node_preamble: + dependency: transitive + description: + name: node_preamble + sha256: "6e7eac89047ab8a8d26cf16127b5ed26de65209847630400f9aefd7cd5c730db" + url: "https://pub.dev" + source: hosted + version: "2.0.2" + package_config: + dependency: transitive + description: + name: package_config + sha256: 
"1c5b77ccc91e4823a5af61ee74e6b972db1ef98c2ff5a18d3161c982a55448bd" + url: "https://pub.dev" + source: hosted + version: "2.1.0" + path: + dependency: transitive + description: + name: path + sha256: "087ce49c3f0dc39180befefc60fdb4acd8f8620e5682fe2476afd0b3688bb4af" + url: "https://pub.dev" + source: hosted + version: "1.9.0" + pool: + dependency: transitive + description: + name: pool + sha256: "20fe868b6314b322ea036ba325e6fc0711a22948856475e2c2b6306e8ab39c2a" + url: "https://pub.dev" + source: hosted + version: "1.5.1" + pub_semver: + dependency: transitive + description: + name: pub_semver + sha256: "40d3ab1bbd474c4c2328c91e3a7df8c6dd629b79ece4c4bd04bee496a224fb0c" + url: "https://pub.dev" + source: hosted + version: "2.1.4" + shelf: + dependency: transitive + description: + name: shelf + sha256: ad29c505aee705f41a4d8963641f91ac4cee3c8fad5947e033390a7bd8180fa4 + url: "https://pub.dev" + source: hosted + version: "1.4.1" + shelf_packages_handler: + dependency: transitive + description: + name: shelf_packages_handler + sha256: "89f967eca29607c933ba9571d838be31d67f53f6e4ee15147d5dc2934fee1b1e" + url: "https://pub.dev" + source: hosted + version: "3.0.2" + shelf_static: + dependency: transitive + description: + name: shelf_static + sha256: a41d3f53c4adf0f57480578c1d61d90342cd617de7fc8077b1304643c2d85c1e + url: "https://pub.dev" + source: hosted + version: "1.1.2" + shelf_web_socket: + dependency: transitive + description: + name: shelf_web_socket + sha256: "073c147238594ecd0d193f3456a5fe91c4b0abbcc68bf5cd95b36c4e194ac611" + url: "https://pub.dev" + source: hosted + version: "2.0.0" + source_map_stack_trace: + dependency: transitive + description: + name: source_map_stack_trace + sha256: "84cf769ad83aa6bb61e0aa5a18e53aea683395f196a6f39c4c881fb90ed4f7ae" + url: "https://pub.dev" + source: hosted + version: "2.1.1" + source_maps: + dependency: transitive + description: + name: source_maps + sha256: "708b3f6b97248e5781f493b765c3337db11c5d2c81c3094f10904bfa8004c703" + 
url: "https://pub.dev" + source: hosted + version: "0.10.12" + source_span: + dependency: transitive + description: + name: source_span + sha256: "53e943d4206a5e30df338fd4c6e7a077e02254531b138a15aec3bd143c1a8b3c" + url: "https://pub.dev" + source: hosted + version: "1.10.0" + stack_trace: + dependency: transitive + description: + name: stack_trace + sha256: "73713990125a6d93122541237550ee3352a2d84baad52d375a4cad2eb9b7ce0b" + url: "https://pub.dev" + source: hosted + version: "1.11.1" + stream_channel: + dependency: transitive + description: + name: stream_channel + sha256: ba2aa5d8cc609d96bbb2899c28934f9e1af5cddbd60a827822ea467161eb54e7 + url: "https://pub.dev" + source: hosted + version: "2.1.2" + string_scanner: + dependency: transitive + description: + name: string_scanner + sha256: "556692adab6cfa87322a115640c11f13cb77b3f076ddcc5d6ae3c20242bedcde" + url: "https://pub.dev" + source: hosted + version: "1.2.0" + term_glyph: + dependency: transitive + description: + name: term_glyph + sha256: a29248a84fbb7c79282b40b8c72a1209db169a2e0542bce341da992fe1bc7e84 + url: "https://pub.dev" + source: hosted + version: "1.2.1" + test: + dependency: "direct dev" + description: + name: test + sha256: "7ee44229615f8f642b68120165ae4c2a75fe77ae2065b1e55ae4711f6cf0899e" + url: "https://pub.dev" + source: hosted + version: "1.25.7" + test_api: + dependency: transitive + description: + name: test_api + sha256: "5b8a98dafc4d5c4c9c72d8b31ab2b23fc13422348d2997120294d3bac86b4ddb" + url: "https://pub.dev" + source: hosted + version: "0.7.2" + test_core: + dependency: transitive + description: + name: test_core + sha256: "55ea5a652e38a1dfb32943a7973f3681a60f872f8c3a05a14664ad54ef9c6696" + url: "https://pub.dev" + source: hosted + version: "0.6.4" + typed_data: + dependency: transitive + description: + name: typed_data + sha256: facc8d6582f16042dd49f2463ff1bd6e2c9ef9f3d5da3d9b087e244a7b564b3c + url: "https://pub.dev" + source: hosted + version: "1.3.2" + vm_service: + dependency: transitive 
+ description: + name: vm_service + sha256: "360c4271613beb44db559547d02f8b0dc044741d0eeb9aa6ccdb47e8ec54c63a" + url: "https://pub.dev" + source: hosted + version: "14.2.3" + watcher: + dependency: transitive + description: + name: watcher + sha256: "3d2ad6751b3c16cf07c7fca317a1413b3f26530319181b37e3b9039b84fc01d8" + url: "https://pub.dev" + source: hosted + version: "1.1.0" + web: + dependency: transitive + description: + name: web + sha256: "97da13628db363c635202ad97068d47c5b8aa555808e7a9411963c533b449b27" + url: "https://pub.dev" + source: hosted + version: "0.5.1" + web_socket: + dependency: transitive + description: + name: web_socket + sha256: "24301d8c293ce6fe327ffe6f59d8fd8834735f0ec36e4fd383ec7ff8a64aa078" + url: "https://pub.dev" + source: hosted + version: "0.1.5" + web_socket_channel: + dependency: transitive + description: + name: web_socket_channel + sha256: a2d56211ee4d35d9b344d9d4ce60f362e4f5d1aafb988302906bd732bc731276 + url: "https://pub.dev" + source: hosted + version: "3.0.0" + webkit_inspection_protocol: + dependency: transitive + description: + name: webkit_inspection_protocol + sha256: "87d3f2333bb240704cd3f1c6b5b7acd8a10e7f0bc28c28dcf14e782014f4a572" + url: "https://pub.dev" + source: hosted + version: "1.2.1" + yaml: + dependency: transitive + description: + name: yaml + sha256: "75769501ea3489fca56601ff33454fe45507ea3bfb014161abc3b43ae25989d5" + url: "https://pub.dev" + source: hosted + version: "3.1.2" +sdks: + dart: ">=3.4.0 <4.0.0" diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml new file mode 100644 index 000000000..e7c7bc1cc --- /dev/null +++ b/dart-api-examples/vad/pubspec.yaml @@ -0,0 +1,17 @@ +name: vad + +description: > + This example demonstrates how to use the Dart API for VAD (voice activity detection). 
+ +version: 1.0.0 + +environment: + sdk: ^3.4.0 + +dependencies: + sherpa_onnx: ^0.0.3 + path: ^1.9.0 + args: ^2.5.0 + +dev_dependencies: + lints: ^3.0.0 diff --git a/dart-api-examples/vad/run.sh b/dart-api-examples/vad/run.sh new file mode 100755 index 000000000..0db5ebe10 --- /dev/null +++ b/dart-api-examples/vad/run.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + + +if [[ ! -f ./silero_vad.onnx ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +fi + +if [[ ! -f ./lei-jun-test.wav ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +fi + +dart run \ + ./bin/vad.dart \ + --silero-vad ./silero_vad.onnx \ + --input-wav ./lei-jun-test.wav \ + --output-wav ./lei-jun-test-no-silence.wav + +ls -lh *.wav diff --git a/scripts/dotnet/generate.py b/scripts/dotnet/generate.py index 9e3443202..46514673f 100755 --- a/scripts/dotnet/generate.py +++ b/scripts/dotnet/generate.py @@ -104,9 +104,9 @@ def process_windows(s, rid): "piper_phonemize.dll", "sherpa-onnx-c-api.dll", "sherpa-onnx-core.dll", - "sherpa-onnx-fstfar.lib", - "sherpa-onnx-fst.lib", - "sherpa-onnx-kaldifst-core.lib", + "sherpa-onnx-fstfar.dll", + "sherpa-onnx-fst.dll", + "sherpa-onnx-kaldifst-core.dll", "ucd.dll", ] diff --git a/scripts/dotnet/run.sh b/scripts/dotnet/run.sh index 7aa3ae5e4..ed28dfad2 100755 --- a/scripts/dotnet/run.sh +++ b/scripts/dotnet/run.sh @@ -30,7 +30,7 @@ mkdir -p linux macos windows-x64 windows-x86 linux_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl linux_wheel=$src_dir/$linux_wheel_filename -macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_universal2.whl +macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_11_0_universal2.whl macos_wheel=$src_dir/$macos_wheel_filename 
windows_x64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl diff --git a/sherpa-onnx/flutter/CHANGELOG.md b/sherpa-onnx/flutter/CHANGELOG.md new file mode 100644 index 000000000..d59aedaca --- /dev/null +++ b/sherpa-onnx/flutter/CHANGELOG.md @@ -0,0 +1,11 @@ +## 0.0.3 + +* Fix path separator on Windows. + +## 0.0.2 + +* Support specifying lib path. + +## 0.0.1 + +* Initial release. diff --git a/sherpa-onnx/flutter/lib/sherpa_onnx.dart b/sherpa-onnx/flutter/lib/sherpa_onnx.dart index 77988d501..ff968a3d8 100644 --- a/sherpa-onnx/flutter/lib/sherpa_onnx.dart +++ b/sherpa-onnx/flutter/lib/sherpa_onnx.dart @@ -14,24 +14,40 @@ export 'src/wave_writer.dart'; import 'src/sherpa_onnx_bindings.dart'; +String? _path; + final DynamicLibrary _dylib = () { if (Platform.isIOS) { throw UnsupportedError('Unknown platform: ${Platform.operatingSystem}'); } if (Platform.isMacOS) { - return DynamicLibrary.open('libsherpa-onnx-c-api.dylib'); + if (_path == null) { + return DynamicLibrary.open('libsherpa-onnx-c-api.dylib'); + } else { + return DynamicLibrary.open('${_path}/libsherpa-onnx-c-api.dylib'); + } } + if (Platform.isAndroid || Platform.isLinux) { - return DynamicLibrary.open('libsherpa-onnx-c-api.so'); + if (_path == null) { + return DynamicLibrary.open('libsherpa-onnx-c-api.so'); + } else { + return DynamicLibrary.open('${_path}/libsherpa-onnx-c-api.so'); + } } if (Platform.isWindows) { - return DynamicLibrary.open('sherpa-onnx-c-api.dll'); + if (_path == null) { + return DynamicLibrary.open('sherpa-onnx-c-api.dll'); + } else { + return DynamicLibrary.open('${_path}\\sherpa-onnx-c-api.dll'); + } } throw UnsupportedError('Unknown platform: ${Platform.operatingSystem}'); }(); -void initBindings() { +void initBindings([String? 
p]) { + _path ??= p; SherpaOnnxBindings.init(_dylib); } diff --git a/sherpa-onnx/flutter/lib/src/offline_recognizer.dart b/sherpa-onnx/flutter/lib/src/offline_recognizer.dart index 6b6c29fa7..633312424 100644 --- a/sherpa-onnx/flutter/lib/src/offline_recognizer.dart +++ b/sherpa-onnx/flutter/lib/src/offline_recognizer.dart @@ -1,7 +1,6 @@ // Copyright (c) 2024 Xiaomi Corporation import 'dart:convert'; import 'dart:ffi'; -import 'dart:typed_data'; import 'package:ffi/ffi.dart'; @@ -262,7 +261,7 @@ class OfflineRecognizer { final json = SherpaOnnxBindings.getOfflineStreamResultAsJson?.call(stream.ptr) ?? nullptr; - if (json == null) { + if (json == nullptr) { return OfflineRecognizerResult(text: '', tokens: [], timestamps: []); } diff --git a/sherpa-onnx/flutter/lib/src/offline_stream.dart b/sherpa-onnx/flutter/lib/src/offline_stream.dart index 4157886d3..0b6f9c866 100644 --- a/sherpa-onnx/flutter/lib/src/offline_stream.dart +++ b/sherpa-onnx/flutter/lib/src/offline_stream.dart @@ -28,7 +28,7 @@ class OfflineStream { final pList = p.asTypedList(n); pList.setAll(0, samples); - SherpaOnnxBindings.acceptWaveformOffline?.call(this.ptr, sampleRate, p, n); + SherpaOnnxBindings.acceptWaveformOffline?.call(ptr, sampleRate, p, n); calloc.free(p); } diff --git a/sherpa-onnx/flutter/lib/src/online_recognizer.dart b/sherpa-onnx/flutter/lib/src/online_recognizer.dart index 8445e9d75..538c68dda 100644 --- a/sherpa-onnx/flutter/lib/src/online_recognizer.dart +++ b/sherpa-onnx/flutter/lib/src/online_recognizer.dart @@ -1,7 +1,6 @@ // Copyright (c) 2024 Xiaomi Corporation import 'dart:convert'; import 'dart:ffi'; -import 'dart:typed_data'; import 'package:ffi/ffi.dart'; @@ -247,7 +246,7 @@ class OnlineRecognizer { final json = SherpaOnnxBindings.getOnlineStreamResultAsJson?.call(ptr, stream.ptr) ?? 
nullptr; - if (json == null) { + if (json == nullptr) { return OnlineRecognizerResult(text: '', tokens: [], timestamps: []); } diff --git a/sherpa-onnx/flutter/lib/src/online_stream.dart b/sherpa-onnx/flutter/lib/src/online_stream.dart index ad4875072..29b196221 100644 --- a/sherpa-onnx/flutter/lib/src/online_stream.dart +++ b/sherpa-onnx/flutter/lib/src/online_stream.dart @@ -28,8 +28,7 @@ class OnlineStream { final pList = p.asTypedList(n); pList.setAll(0, samples); - SherpaOnnxBindings.onlineStreamAcceptWaveform - ?.call(this.ptr, sampleRate, p, n); + SherpaOnnxBindings.onlineStreamAcceptWaveform?.call(ptr, sampleRate, p, n); calloc.free(p); } diff --git a/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart b/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart index 273b48e52..997bfc70e 100644 --- a/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart +++ b/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart @@ -553,13 +553,10 @@ typedef DestroyOnlineStreamNative = Void Function( typedef DestroyOnlineStream = void Function(Pointer); typedef OnlineStreamAcceptWaveformNative = Void Function( - Pointer, - Int32 sample_rate, - Pointer, - Int32 n); + Pointer, Int32, Pointer, Int32); typedef OnlineStreamAcceptWaveform = void Function( - Pointer, int sample_rate, Pointer, int n); + Pointer, int, Pointer, int); typedef OnlineStreamInputFinishedNative = Void Function( Pointer); diff --git a/sherpa-onnx/flutter/lib/src/vad.dart b/sherpa-onnx/flutter/lib/src/vad.dart index 5fe4392f4..7f8b412a1 100644 --- a/sherpa-onnx/flutter/lib/src/vad.dart +++ b/sherpa-onnx/flutter/lib/src/vad.dart @@ -106,8 +106,8 @@ class CircularBuffer { SherpaOnnxBindings.circularBufferReset?.call(this.ptr); } - int get size => SherpaOnnxBindings.circularBufferSize?.call(this.ptr) ?? 0; - int get head => SherpaOnnxBindings.circularBufferHead?.call(this.ptr) ?? 0; + int get size => SherpaOnnxBindings.circularBufferSize?.call(ptr) ?? 
0; + int get head => SherpaOnnxBindings.circularBufferHead?.call(ptr) ?? 0; Pointer ptr; } @@ -159,38 +159,36 @@ class VoiceActivityDetector { final pList = p.asTypedList(n); pList.setAll(0, samples); - SherpaOnnxBindings.voiceActivityDetectorAcceptWaveform - ?.call(this.ptr, p, n); + SherpaOnnxBindings.voiceActivityDetectorAcceptWaveform?.call(ptr, p, n); calloc.free(p); } bool isEmpty() { final int empty = - SherpaOnnxBindings.voiceActivityDetectorEmpty?.call(this.ptr) ?? 0; + SherpaOnnxBindings.voiceActivityDetectorEmpty?.call(ptr) ?? 0; return empty == 1; } bool isDetected() { final int detected = - SherpaOnnxBindings.voiceActivityDetectorDetected?.call(this.ptr) ?? 0; + SherpaOnnxBindings.voiceActivityDetectorDetected?.call(ptr) ?? 0; return detected == 1; } void pop() { - SherpaOnnxBindings.voiceActivityDetectorPop?.call(this.ptr); + SherpaOnnxBindings.voiceActivityDetectorPop?.call(ptr); } void clear() { - SherpaOnnxBindings.voiceActivityDetectorClear?.call(this.ptr); + SherpaOnnxBindings.voiceActivityDetectorClear?.call(ptr); } SpeechSegment front() { final Pointer segment = - SherpaOnnxBindings.voiceActivityDetectorFront?.call(this.ptr) ?? - nullptr; + SherpaOnnxBindings.voiceActivityDetectorFront?.call(ptr) ?? 
nullptr; if (segment == nullptr) { return SpeechSegment(samples: Float32List(0), start: 0); } @@ -206,7 +204,7 @@ class VoiceActivityDetector { } void reset() { - SherpaOnnxBindings.voiceActivityDetectorReset?.call(this.ptr); + SherpaOnnxBindings.voiceActivityDetectorReset?.call(ptr); } Pointer ptr; diff --git a/sherpa-onnx/flutter/notes.md b/sherpa-onnx/flutter/notes.md index 946f08973..42d872e58 100644 --- a/sherpa-onnx/flutter/notes.md +++ b/sherpa-onnx/flutter/notes.md @@ -43,3 +43,20 @@ dart analyze FLUTTER_XCODE_ARCHS=arm64 FLUTTER_XCODE_ARCHS=x86_64 ``` + +## Examples + + - https://dart.dev/tools/pub/automated-publishing + + Use GitHub actions to publish + + - https://dart.dev/tools/pub/pubspec + + It describes the format of ./pubspec.yaml + + - https://github.com/folksable/blurhash_ffi/ + + It supports ios, android, linux, macos, and windows. + + - https://github.com/alexmercerind/dart_vlc + - https://github.com/dart-lang/native/tree/main/pkgs/jni diff --git a/sherpa-onnx/flutter/pubspec.yaml b/sherpa-onnx/flutter/pubspec.yaml index ce84f9d74..d216928da 100644 --- a/sherpa-onnx/flutter/pubspec.yaml +++ b/sherpa-onnx/flutter/pubspec.yaml @@ -1,17 +1,24 @@ name: sherpa_onnx + description: > - Dart bindings for sherpa-onnx. -repository: https://github.com/k2-fsa/sherpa-onnx/tree/main/sherpa-onnx/flutter + Speech recognition, speech synthesis, and speaker recognition using next-gen Kaldi + with onnxruntime without Internet connection. 
+ +repository: https://github.com/k2-fsa/sherpa-onnx/tree/master/sherpa-onnx/flutter + +issue_tracker: https://github.com/k2-fsa/sherpa-onnx/issues +documentation: https://k2-fsa.github.io/sherpa/onnx/ + topics: - - speech-to-text - - text-to-speech + - speech-recognition + - speech-synthesis - speaker-identification - - spoken-language-identification - audio-tagging - voice-activity-detection # remember to change the version in macos/sherpa_onnx.podspec -version: 0.0.1 +version: 0.0.2 + homepage: https://github.com/k2-fsa/sherpa-onnx environment: @@ -22,30 +29,14 @@ dependencies: ffi: ^2.1.0 flutter: sdk: flutter - plugin_platform_interface: ^2.0.2 dev_dependencies: flutter_test: sdk: flutter flutter_lints: ^3.0.0 -# For information on the generic Dart part of this file, see the -# following page: https://dart.dev/tools/pub/pubspec - # The following section is specific to Flutter packages. flutter: - # This section identifies this Flutter project as a plugin project. - # The 'pluginClass' specifies the class (in Java, Kotlin, Swift, Objective-C, etc.) - # which should be registered in the plugin registry. This is required for - # using method channels. - # The Android 'package' specifies package in which the registered class is. - # This is required for using method channels on Android. - # The 'ffiPlugin' specifies that native code should be built and bundled. - # This is required for using `dart:ffi`. - # All these are used by the tooling to maintain consistency when - # adding or updating assets for this project. - # - # Please refer to README.md for a detailed explanation. 
plugin: platforms: macos: @@ -54,34 +45,3 @@ flutter: ffiPlugin: true linux: ffiPlugin: true - - # To add assets to your plugin package, add an assets section, like this: - # assets: - # - images/a_dot_burr.jpeg - # - images/a_dot_ham.jpeg - # - # For details regarding assets in packages, see - # https://flutter.dev/assets-and-images/#from-packages - # - # An image asset can refer to one or more resolution-specific "variants", see - # https://flutter.dev/assets-and-images/#resolution-aware - - # To add custom fonts to your plugin package, add a fonts section here, - # in this "flutter" section. Each entry in this list should have a - # "family" key with the font family name, and a "fonts" key with a - # list giving the asset and other descriptors for the font. For - # example: - # fonts: - # - family: Schyler - # fonts: - # - asset: fonts/Schyler-Regular.ttf - # - asset: fonts/Schyler-Italic.ttf - # style: italic - # - family: Trajan Pro - # fonts: - # - asset: fonts/TrajanPro.ttf - # - asset: fonts/TrajanPro_Bold.ttf - # weight: 700 - # - # For details regarding fonts in packages, see - # https://flutter.dev/custom-fonts/#from-packages diff --git a/sherpa-onnx/flutter/windows/CMakeLists.txt b/sherpa-onnx/flutter/windows/CMakeLists.txt index 06fa598ca..a6a262ee4 100644 --- a/sherpa-onnx/flutter/windows/CMakeLists.txt +++ b/sherpa-onnx/flutter/windows/CMakeLists.txt @@ -19,9 +19,9 @@ set(sherpa_onnx_bundled_libraries "${CMAKE_CURRENT_SOURCE_DIR}/sherpa-onnx-c-api.dll" "${CMAKE_CURRENT_SOURCE_DIR}/sherpa-onnx-core.dll" "${CMAKE_CURRENT_SOURCE_DIR}/kaldi-decoder-core.dll" - "${CMAKE_CURRENT_SOURCE_DIR}/sherpa-onnx-kaldifst-core.lib" - "${CMAKE_CURRENT_SOURCE_DIR}/sherpa-onnx-fstfar.lib" - "${CMAKE_CURRENT_SOURCE_DIR}/sherpa-onnx-fst.lib" + "${CMAKE_CURRENT_SOURCE_DIR}/sherpa-onnx-kaldifst-core.dll" + "${CMAKE_CURRENT_SOURCE_DIR}/sherpa-onnx-fstfar.dll" + "${CMAKE_CURRENT_SOURCE_DIR}/sherpa-onnx-fst.dll" "${CMAKE_CURRENT_SOURCE_DIR}/kaldi-native-fbank-core.dll" 
"${CMAKE_CURRENT_SOURCE_DIR}/piper_phonemize.dll" "${CMAKE_CURRENT_SOURCE_DIR}/espeak-ng.dll" From 20a21133af930dafb15a2f7b7bf46ea81800f410 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 14 Jun 2024 11:50:55 +0800 Subject: [PATCH 015/237] Use CI to publish dart packages (#1001) --- .github/workflows/release-dart-package.yaml | 51 +++ dart-api-examples/vad/pubspec.lock | 372 ++------------------ scripts/dart/release.sh | 82 +++++ scripts/dotnet/run.sh | 4 +- sherpa-onnx/flutter/CHANGELOG.md | 4 + sherpa-onnx/flutter/pubspec.yaml | 4 - 6 files changed, 171 insertions(+), 346 deletions(-) create mode 100644 .github/workflows/release-dart-package.yaml create mode 100755 scripts/dart/release.sh diff --git a/.github/workflows/release-dart-package.yaml b/.github/workflows/release-dart-package.yaml new file mode 100644 index 000000000..9ed4d0150 --- /dev/null +++ b/.github/workflows/release-dart-package.yaml @@ -0,0 +1,51 @@ +name: release-dart + +on: + push: + branches: + - ci-pub-dart + tags: + - 'v[0-9]+.[0-9]+.[0-9]+*' # tag-pattern on pub.dev: 'v{{version}}' + + workflow_dispatch: + +concurrency: + group: release-dart-${{ github.ref }} + cancel-in-progress: true + +jobs: + release_dart: + permissions: + id-token: write # Required for authentication using OIDC + name: Release dart + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Setup Flutter SDK + uses: flutter-actions/setup-flutter@v3 + with: + channel: stable + version: latest + + - uses: dart-lang/setup-dart@v1 + + - name: Copy pre-build libs + shell: bash + run: | + cd scripts/dart + ./release.sh + cd ../.. 
+ + mv -v sherpa-onnx/flutter /tmp/to-be-published + + cp -v README.md /tmp/to-be-published + + - name: Release + shell: bash + run: | + cd /tmp/to-be-published + flutter pub get + flutter pub publish --dry-run + flutter pub publish --force diff --git a/dart-api-examples/vad/pubspec.lock b/dart-api-examples/vad/pubspec.lock index 776660ec5..a29b073e6 100644 --- a/dart-api-examples/vad/pubspec.lock +++ b/dart-api-examples/vad/pubspec.lock @@ -1,51 +1,22 @@ # Generated by pub # See https://dart.dev/tools/pub/glossary#lockfile packages: - _fe_analyzer_shared: - dependency: transitive - description: - name: _fe_analyzer_shared - sha256: "5aaf60d96c4cd00fe7f21594b5ad6a1b699c80a27420f8a837f4d68473ef09e3" - url: "https://pub.dev" - source: hosted - version: "68.0.0" - _macros: - dependency: transitive - description: dart - source: sdk - version: "0.1.0" - analyzer: - dependency: transitive - description: - name: analyzer - sha256: "21f1d3720fd1c70316399d5e2bccaebb415c434592d778cce8acb967b8578808" - url: "https://pub.dev" - source: hosted - version: "6.5.0" args: - dependency: transitive + dependency: "direct main" description: name: args sha256: "7cf60b9f0cc88203c5a190b4cd62a99feea42759a7fa695010eb5de1c0b2252a" url: "https://pub.dev" source: hosted version: "2.5.0" - async: + characters: dependency: transitive description: - name: async - sha256: "947bfcf187f74dbc5e146c9eb9c0f10c9f8b30743e341481c1e2ed3ecc18c20c" + name: characters + sha256: "04a925763edad70e8443c99234dc3328f442e811f1d8fd1a72f1c8ad0f69a605" url: "https://pub.dev" source: hosted - version: "2.11.0" - boolean_selector: - dependency: transitive - description: - name: boolean_selector - sha256: "6cfb5af12253eaf2b368f07bacc5a80d1301a071c73360d746b7f2e32d762c66" - url: "https://pub.dev" - source: hosted - version: "2.1.1" + version: "1.3.0" collection: dependency: transitive description: @@ -54,86 +25,19 @@ packages: url: "https://pub.dev" source: hosted version: "1.18.0" - convert: - dependency: transitive - 
description: - name: convert - sha256: "0f08b14755d163f6e2134cb58222dd25ea2a2ee8a195e53983d57c075324d592" - url: "https://pub.dev" - source: hosted - version: "3.1.1" - coverage: - dependency: transitive - description: - name: coverage - sha256: "3945034e86ea203af7a056d98e98e42a5518fff200d6e8e6647e1886b07e936e" - url: "https://pub.dev" - source: hosted - version: "1.8.0" - crypto: - dependency: transitive - description: - name: crypto - sha256: ff625774173754681d66daaf4a448684fb04b78f902da9cb3d308c19cc5e8bab - url: "https://pub.dev" - source: hosted - version: "3.0.3" - file: + ffi: dependency: transitive description: - name: file - sha256: "5fc22d7c25582e38ad9a8515372cd9a93834027aacf1801cf01164dac0ffa08c" - url: "https://pub.dev" - source: hosted - version: "7.0.0" - frontend_server_client: - dependency: transitive - description: - name: frontend_server_client - sha256: f64a0333a82f30b0cca061bc3d143813a486dc086b574bfb233b7c1372427694 - url: "https://pub.dev" - source: hosted - version: "4.0.0" - glob: - dependency: transitive - description: - name: glob - sha256: "0e7014b3b7d4dac1ca4d6114f82bf1782ee86745b9b42a92c9289c23d8a0ab63" + name: ffi + sha256: "493f37e7df1804778ff3a53bd691d8692ddf69702cf4c1c1096a2e41b4779e21" url: "https://pub.dev" source: hosted version: "2.1.2" - http_multi_server: - dependency: transitive - description: - name: http_multi_server - sha256: "97486f20f9c2f7be8f514851703d0119c3596d14ea63227af6f7a481ef2b2f8b" - url: "https://pub.dev" - source: hosted - version: "3.2.1" - http_parser: + flutter: dependency: transitive - description: - name: http_parser - sha256: "2aa08ce0341cc9b354a498388e30986515406668dbcc4f7c950c3e715496693b" - url: "https://pub.dev" - source: hosted - version: "4.0.2" - io: - dependency: transitive - description: - name: io - sha256: "2ec25704aba361659e10e3e5f5d672068d332fc8ac516421d483a11e5cbd061e" - url: "https://pub.dev" - source: hosted - version: "1.0.4" - js: - dependency: transitive - description: - name: js - 
sha256: c1b2e9b5ea78c45e1a0788d29606ba27dc5f71f019f32ca5140f61ef071838cf - url: "https://pub.dev" - source: hosted - version: "0.7.1" + description: flutter + source: sdk + version: "0.0.0" lints: dependency: "direct dev" description: @@ -142,261 +46,51 @@ packages: url: "https://pub.dev" source: hosted version: "3.0.0" - logging: + material_color_utilities: dependency: transitive description: - name: logging - sha256: "623a88c9594aa774443aa3eb2d41807a48486b5613e67599fb4c41c0ad47c340" + name: material_color_utilities + sha256: "0e0a020085b65b6083975e499759762399b4475f766c21668c4ecca34ea74e5a" url: "https://pub.dev" source: hosted - version: "1.2.0" - macros: - dependency: transitive - description: - name: macros - sha256: "12e8a9842b5a7390de7a781ec63d793527582398d16ea26c60fed58833c9ae79" - url: "https://pub.dev" - source: hosted - version: "0.1.0-main.0" - matcher: - dependency: transitive - description: - name: matcher - sha256: d2323aa2060500f906aa31a895b4030b6da3ebdcc5619d14ce1aada65cd161cb - url: "https://pub.dev" - source: hosted - version: "0.12.16+1" + version: "0.8.0" meta: dependency: transitive description: name: meta - sha256: bdb68674043280c3428e9ec998512fb681678676b3c54e773629ffe74419f8c7 - url: "https://pub.dev" - source: hosted - version: "1.15.0" - mime: - dependency: transitive - description: - name: mime - sha256: "2e123074287cc9fd6c09de8336dae606d1ddb88d9ac47358826db698c176a1f2" + sha256: "7687075e408b093f36e6bbf6c91878cc0d4cd10f409506f7bc996f68220b9136" url: "https://pub.dev" source: hosted - version: "1.0.5" - node_preamble: - dependency: transitive - description: - name: node_preamble - sha256: "6e7eac89047ab8a8d26cf16127b5ed26de65209847630400f9aefd7cd5c730db" - url: "https://pub.dev" - source: hosted - version: "2.0.2" - package_config: - dependency: transitive - description: - name: package_config - sha256: "1c5b77ccc91e4823a5af61ee74e6b972db1ef98c2ff5a18d3161c982a55448bd" - url: "https://pub.dev" - source: hosted - version: "2.1.0" + 
version: "1.12.0" path: - dependency: transitive + dependency: "direct main" description: name: path sha256: "087ce49c3f0dc39180befefc60fdb4acd8f8620e5682fe2476afd0b3688bb4af" url: "https://pub.dev" source: hosted version: "1.9.0" - pool: - dependency: transitive + sherpa_onnx: + dependency: "direct main" description: - name: pool - sha256: "20fe868b6314b322ea036ba325e6fc0711a22948856475e2c2b6306e8ab39c2a" + name: sherpa_onnx + sha256: "6cfadf7bc35001bb1284f9fac1e03e33787cafa918e0c45da96d1e91afa58751" url: "https://pub.dev" source: hosted - version: "1.5.1" - pub_semver: + version: "0.0.3" + sky_engine: dependency: transitive - description: - name: pub_semver - sha256: "40d3ab1bbd474c4c2328c91e3a7df8c6dd629b79ece4c4bd04bee496a224fb0c" - url: "https://pub.dev" - source: hosted - version: "2.1.4" - shelf: - dependency: transitive - description: - name: shelf - sha256: ad29c505aee705f41a4d8963641f91ac4cee3c8fad5947e033390a7bd8180fa4 - url: "https://pub.dev" - source: hosted - version: "1.4.1" - shelf_packages_handler: - dependency: transitive - description: - name: shelf_packages_handler - sha256: "89f967eca29607c933ba9571d838be31d67f53f6e4ee15147d5dc2934fee1b1e" - url: "https://pub.dev" - source: hosted - version: "3.0.2" - shelf_static: - dependency: transitive - description: - name: shelf_static - sha256: a41d3f53c4adf0f57480578c1d61d90342cd617de7fc8077b1304643c2d85c1e - url: "https://pub.dev" - source: hosted - version: "1.1.2" - shelf_web_socket: - dependency: transitive - description: - name: shelf_web_socket - sha256: "073c147238594ecd0d193f3456a5fe91c4b0abbcc68bf5cd95b36c4e194ac611" - url: "https://pub.dev" - source: hosted - version: "2.0.0" - source_map_stack_trace: - dependency: transitive - description: - name: source_map_stack_trace - sha256: "84cf769ad83aa6bb61e0aa5a18e53aea683395f196a6f39c4c881fb90ed4f7ae" - url: "https://pub.dev" - source: hosted - version: "2.1.1" - source_maps: - dependency: transitive - description: - name: source_maps - sha256: 
"708b3f6b97248e5781f493b765c3337db11c5d2c81c3094f10904bfa8004c703" - url: "https://pub.dev" - source: hosted - version: "0.10.12" - source_span: - dependency: transitive - description: - name: source_span - sha256: "53e943d4206a5e30df338fd4c6e7a077e02254531b138a15aec3bd143c1a8b3c" - url: "https://pub.dev" - source: hosted - version: "1.10.0" - stack_trace: - dependency: transitive - description: - name: stack_trace - sha256: "73713990125a6d93122541237550ee3352a2d84baad52d375a4cad2eb9b7ce0b" - url: "https://pub.dev" - source: hosted - version: "1.11.1" - stream_channel: - dependency: transitive - description: - name: stream_channel - sha256: ba2aa5d8cc609d96bbb2899c28934f9e1af5cddbd60a827822ea467161eb54e7 - url: "https://pub.dev" - source: hosted - version: "2.1.2" - string_scanner: - dependency: transitive - description: - name: string_scanner - sha256: "556692adab6cfa87322a115640c11f13cb77b3f076ddcc5d6ae3c20242bedcde" - url: "https://pub.dev" - source: hosted - version: "1.2.0" - term_glyph: - dependency: transitive - description: - name: term_glyph - sha256: a29248a84fbb7c79282b40b8c72a1209db169a2e0542bce341da992fe1bc7e84 - url: "https://pub.dev" - source: hosted - version: "1.2.1" - test: - dependency: "direct dev" - description: - name: test - sha256: "7ee44229615f8f642b68120165ae4c2a75fe77ae2065b1e55ae4711f6cf0899e" - url: "https://pub.dev" - source: hosted - version: "1.25.7" - test_api: - dependency: transitive - description: - name: test_api - sha256: "5b8a98dafc4d5c4c9c72d8b31ab2b23fc13422348d2997120294d3bac86b4ddb" - url: "https://pub.dev" - source: hosted - version: "0.7.2" - test_core: - dependency: transitive - description: - name: test_core - sha256: "55ea5a652e38a1dfb32943a7973f3681a60f872f8c3a05a14664ad54ef9c6696" - url: "https://pub.dev" - source: hosted - version: "0.6.4" - typed_data: - dependency: transitive - description: - name: typed_data - sha256: facc8d6582f16042dd49f2463ff1bd6e2c9ef9f3d5da3d9b087e244a7b564b3c - url: "https://pub.dev" - 
source: hosted - version: "1.3.2" - vm_service: - dependency: transitive - description: - name: vm_service - sha256: "360c4271613beb44db559547d02f8b0dc044741d0eeb9aa6ccdb47e8ec54c63a" - url: "https://pub.dev" - source: hosted - version: "14.2.3" - watcher: - dependency: transitive - description: - name: watcher - sha256: "3d2ad6751b3c16cf07c7fca317a1413b3f26530319181b37e3b9039b84fc01d8" - url: "https://pub.dev" - source: hosted - version: "1.1.0" - web: - dependency: transitive - description: - name: web - sha256: "97da13628db363c635202ad97068d47c5b8aa555808e7a9411963c533b449b27" - url: "https://pub.dev" - source: hosted - version: "0.5.1" - web_socket: - dependency: transitive - description: - name: web_socket - sha256: "24301d8c293ce6fe327ffe6f59d8fd8834735f0ec36e4fd383ec7ff8a64aa078" - url: "https://pub.dev" - source: hosted - version: "0.1.5" - web_socket_channel: - dependency: transitive - description: - name: web_socket_channel - sha256: a2d56211ee4d35d9b344d9d4ce60f362e4f5d1aafb988302906bd732bc731276 - url: "https://pub.dev" - source: hosted - version: "3.0.0" - webkit_inspection_protocol: - dependency: transitive - description: - name: webkit_inspection_protocol - sha256: "87d3f2333bb240704cd3f1c6b5b7acd8a10e7f0bc28c28dcf14e782014f4a572" - url: "https://pub.dev" - source: hosted - version: "1.2.1" - yaml: + description: flutter + source: sdk + version: "0.0.99" + vector_math: dependency: transitive description: - name: yaml - sha256: "75769501ea3489fca56601ff33454fe45507ea3bfb014161abc3b43ae25989d5" + name: vector_math + sha256: "80b3257d1492ce4d091729e3a67a60407d227c27241d6927be0130c98e741803" url: "https://pub.dev" source: hosted - version: "3.1.2" + version: "2.1.4" sdks: dart: ">=3.4.0 <4.0.0" + flutter: ">=3.3.0" diff --git a/scripts/dart/release.sh b/scripts/dart/release.sh new file mode 100755 index 000000000..78b80ae61 --- /dev/null +++ b/scripts/dart/release.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +# see +# 
https://dart.dev/tools/pub/automated-publishing + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) +echo "SCRIPT_DIR: $SCRIPT_DIR" +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" $SHERPA_ONNX_DIR/CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + +src_dir=$SHERPA_ONNX_DIR/sherpa-onnx/flutter +pushd $src_dir + +v="version: $SHERPA_ONNX_VERSION" +echo "v: $v" +sed -i.bak s"/^version: .*/$v/" ./pubspec.yaml +rm *.bak +rm notes.md +git status +git diff + +HF_MIRROR=hf.co +linux_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl +linux_wheel=$src_dir/$linux_wheel_filename + +macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_10_14_universal2.whl +macos_wheel=$src_dir/$macos_wheel_filename + +windows_x64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl +windows_x64_wheel=$src_dir/$windows_x64_wheel_filename + +function process_linux() { + mkdir -p t + cd t + curl -OL https://$HF_MIRROR/csukuangfj/sherpa-onnx-wheels/resolve/main/$linux_wheel_filename + unzip $linux_wheel_filename + cp -v sherpa_onnx/lib/*.so* ../linux + cd .. + rm -rf t + + pushd linux + + rm -v libpiper_phonemize.so libpiper_phonemize.so.1.2.0 + rm -v libonnxruntime.so + rm -v libcargs.so + + popd +} + +function process_windows_x64() { + mkdir -p t + cd t + curl -OL https://$HF_MIRROR/csukuangfj/sherpa-onnx-wheels/resolve/main/$windows_x64_wheel_filename + unzip $windows_x64_wheel_filename + cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.dll ../windows + cd .. + rm -rf t +} + +function process_macos() { + mkdir -p t + cd t + curl -OL https://$HF_MIRROR/csukuangfj/sherpa-onnx-wheels/resolve/main/$macos_wheel_filename + unzip $macos_wheel_filename + cp -v sherpa_onnx/lib/*.dylib ../macos + cd .. 
+ rm -rf t + + pushd macos + rm -v libcargs.dylib + rm -v libonnxruntime.dylib + rm -v libpiper_phonemize.1.2.0.dylib libpiper_phonemize.dylib + popd +} + +process_linux +process_windows_x64 +process_macos diff --git a/scripts/dotnet/run.sh b/scripts/dotnet/run.sh index ed28dfad2..a41050f18 100755 --- a/scripts/dotnet/run.sh +++ b/scripts/dotnet/run.sh @@ -30,7 +30,7 @@ mkdir -p linux macos windows-x64 windows-x86 linux_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl linux_wheel=$src_dir/$linux_wheel_filename -macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_11_0_universal2.whl +macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_10_14_universal2.whl macos_wheel=$src_dir/$macos_wheel_filename windows_x64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl @@ -96,7 +96,6 @@ if [ ! -f $src_dir/windows-x64/sherpa-onnx-core.dll ]; then fi unzip $windows_x64_wheel_filename cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.dll ../ - cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.lib ../ cd .. rm -rf wheel @@ -116,7 +115,6 @@ if [ ! -f $src_dir/windows-x86/sherpa-onnx-core.dll ]; then fi unzip $windows_x86_wheel_filename cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.dll ../ - cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.lib ../ cd .. rm -rf wheel diff --git a/sherpa-onnx/flutter/CHANGELOG.md b/sherpa-onnx/flutter/CHANGELOG.md index d59aedaca..1e583389f 100644 --- a/sherpa-onnx/flutter/CHANGELOG.md +++ b/sherpa-onnx/flutter/CHANGELOG.md @@ -1,3 +1,7 @@ +## 1.9.29 + +* Publish with CI + ## 0.0.3 * Fix path separator on Windows. 
diff --git a/sherpa-onnx/flutter/pubspec.yaml b/sherpa-onnx/flutter/pubspec.yaml index d216928da..3b59ef479 100644 --- a/sherpa-onnx/flutter/pubspec.yaml +++ b/sherpa-onnx/flutter/pubspec.yaml @@ -30,10 +30,6 @@ dependencies: flutter: sdk: flutter -dev_dependencies: - flutter_test: - sdk: flutter - flutter_lints: ^3.0.0 # The following section is specific to Flutter packages. flutter: From dcd6ec31235b94ebbdc92f6285207cae3c05510a Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 14 Jun 2024 12:13:49 +0800 Subject: [PATCH 016/237] Publish osx-arm64 nuget package for .Net (#1003) --- scripts/dotnet/generate.py | 11 +++--- scripts/dotnet/run.sh | 55 +++++++++++++++++++++------- scripts/dotnet/sherpa-onnx.csproj.in | 3 +- 3 files changed, 50 insertions(+), 19 deletions(-) diff --git a/scripts/dotnet/generate.py b/scripts/dotnet/generate.py index 46514673f..db76469ed 100755 --- a/scripts/dotnet/generate.py +++ b/scripts/dotnet/generate.py @@ -64,7 +64,7 @@ def process_linux(s): f.write(s) -def process_macos(s): +def process_macos(s, rid): libs = [ "libespeak-ng.dylib", "libkaldi-decoder-core.dylib", @@ -79,18 +79,18 @@ def process_macos(s): "libsherpa-onnx-kaldifst-core.dylib", "libucd.dylib", ] - prefix = f"{src_dir}/macos/" + prefix = f"{src_dir}/macos-{rid}/" libs = [prefix + lib for lib in libs] libs = "\n ;".join(libs) d = get_dict() - d["dotnet_rid"] = "osx-x64" + d["dotnet_rid"] = f"osx-{rid}" d["libs"] = libs environment = jinja2.Environment() template = environment.from_string(s) s = template.render(**d) - with open("./macos/sherpa-onnx.runtime.csproj", "w") as f: + with open(f"./macos-{rid}/sherpa-onnx.runtime.csproj", "w") as f: f.write(s) @@ -129,7 +129,8 @@ def process_windows(s, rid): def main(): s = read_proj_file("./sherpa-onnx.csproj.runtime.in") - process_macos(s) + process_macos(s, "x64") + process_macos(s, "arm64") process_linux(s) process_windows(s, "x64") process_windows(s, "x86") diff --git a/scripts/dotnet/run.sh b/scripts/dotnet/run.sh 
index a41050f18..07a4153e4 100755 --- a/scripts/dotnet/run.sh +++ b/scripts/dotnet/run.sh @@ -24,14 +24,16 @@ export src_dir mkdir -p $src_dir pushd $src_dir -mkdir -p linux macos windows-x64 windows-x86 - +mkdir -p linux macos-x64 macos-arm64 windows-x64 windows-x86 linux_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl linux_wheel=$src_dir/$linux_wheel_filename -macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_10_14_universal2.whl -macos_wheel=$src_dir/$macos_wheel_filename +macos_x64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_11_0_x86_64.whl +macos_x64_wheel=$src_dir/$macos_x64_wheel_filename + +macos_arm64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_11_0_arm64.whl +macos_arm64_wheel=$src_dir/$macos_arm64_wheel_filename windows_x64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl windows_x64_wheel=$src_dir/$windows_x64_wheel_filename @@ -60,17 +62,17 @@ if [ ! -f $src_dir/linux/libsherpa-onnx-core.so ]; then cd .. fi -if [ ! -f $src_dir/macos/libsherpa-onnx-core.dylib ]; then - echo "--- macOS x86_64/arm64 universal2---" - cd macos +if [ ! -f $src_dir/macos-x64/libsherpa-onnx-core.dylib ]; then + echo "--- macOS x86_64---" + cd macos-x64 mkdir -p wheel cd wheel - if [ -f $macos_wheel ]; then - cp -v $macos_wheel . + if [ -f $macos_x64_wheel ]; then + cp -v $macos_x64_wheel . else - curl -OL https://$HF_MIRROR/csukuangfj/sherpa-onnx-wheels/resolve/main/$macos_wheel_filename + curl -OL https://$HF_MIRROR/csukuangfj/sherpa-onnx-wheels/resolve/main/$macos_x64_wheel_filename fi - unzip $macos_wheel_filename + unzip $macos_x64_wheel_filename cp -v sherpa_onnx/lib/*.dylib ../ cd .. @@ -83,6 +85,28 @@ if [ ! -f $src_dir/macos/libsherpa-onnx-core.dylib ]; then cd .. fi +if [ ! 
-f $src_dir/macos-arm64/libsherpa-onnx-core.dylib ]; then + echo "--- macOS arm64---" + cd macos-arm64 + mkdir -p wheel + cd wheel + if [ -f $macos_arm64_wheel ]; then + cp -v $macos_arm64_wheel . + else + curl -OL https://$HF_MIRROR/csukuangfj/sherpa-onnx-wheels/resolve/main/$macos_arm64_wheel_filename + fi + unzip $macos_arm64_wheel_filename + cp -v sherpa_onnx/lib/*.dylib ../ + + cd .. + + rm -v libcargs.dylib + rm -v libonnxruntime.dylib + rm -v libpiper_phonemize.1.2.0.dylib libpiper_phonemize.dylib + rm -rf wheel + ls -lh + cd .. +fi if [ ! -f $src_dir/windows-x64/sherpa-onnx-core.dll ]; then echo "---windows x64---" @@ -124,7 +148,7 @@ fi popd -mkdir -p macos linux windows-x64 windows-x86 all +mkdir -p macos-x64 macos-arm64 linux windows-x64 windows-x86 all cp ./*.cs all @@ -135,7 +159,12 @@ dotnet build -c Release dotnet pack -c Release -o ../packages popd -pushd macos +pushd macos-x64 +dotnet build -c Release +dotnet pack -c Release -o ../packages +popd + +pushd macos-arm64 dotnet build -c Release dotnet pack -c Release -o ../packages popd diff --git a/scripts/dotnet/sherpa-onnx.csproj.in b/scripts/dotnet/sherpa-onnx.csproj.in index 60ae41878..905b65c1f 100644 --- a/scripts/dotnet/sherpa-onnx.csproj.in +++ b/scripts/dotnet/sherpa-onnx.csproj.in @@ -5,7 +5,7 @@ Library 10.0 netstandard2.0 - linux-x64;osx-x64;win-x64 + linux-x64;osx-x64;osx-arm64;win-x64;win-x86 true sherpa-onnx {{ version }} @@ -50,6 +50,7 @@ + From dd21ebdabfe2a2c1576f4f67994d88af111ee9e1 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 14 Jun 2024 13:06:52 +0800 Subject: [PATCH 017/237] Update README (#1004) --- README.md | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e2c4e913a..75579916b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,33 @@ -# Introduction +### Supported functions + +|Speech recognition| Speech synthesis | Speaker verification | Speaker identification | 
+|------------------|------------------|----------------------|------------------------| +| ✔️ | ✔️ | ✔️ | ✔️ | + +| Spoken Language identification | Audio tagging | Voice activity detection | Keyword spotting | +|--------------------------------|---------------|--------------------------|------------------| +| ✔️ | ✔️ | ✔️ | ✔️ | + +### Supported platforms + +|Architecture| Android | iOS | Windows | macOS | linux | +|------------|------------------|---------------|------------|-------|-------| +| x64 | ✔️ | | ✔️ | ✔️ | ✔️ | +| x86 | ✔️ | | ✔️ | | | +| arm64 | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | +| arm32 | ✔️ | | | | ✔️ | +| riscv64 | | | | | ✔️ | + + +### Supported programming languages + +| C++ | C | Python | C# | Java | JavaScript | Kotlin | Swift | Go | Dart | +|-----|----|--------|----|------|------------|--------|-------|----|------| +| ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | + +It also supports WebAssembly. + +## Introduction This repository supports running the following functions **locally** @@ -33,7 +62,7 @@ with the following APIs - Swift - Dart -## Links for pre-built Android APKs +### Links for pre-built Android APKs | Description | URL | 中国用户 | |--------------------------------|-----------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| @@ -48,7 +77,7 @@ with the following APIs | Spoken language identification | [Address](https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk-cn.html) | |Keyword spotting| [Address](https://k2-fsa.github.io/sherpa/onnx/kws/apk.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/kws/apk-cn.html)| -## Links for pre-trained models +### Links for pre-trained models | Description | URL | 
|--------------------------------|--------------------------------------------------------------------------------------------------------------------------------| @@ -61,12 +90,12 @@ with the following APIs | Spoken language identification (Language ID) | See multi-lingual Whisper ASR models from [Speech recognition](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) | | Punctuation| [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models)| -## Useful links +### Useful links - Documentation: https://k2-fsa.github.io/sherpa/onnx/ - Bilibili 演示视频: https://search.bilibili.com/all?keyword=%E6%96%B0%E4%B8%80%E4%BB%A3Kaldi -## How to reach us +### How to reach us Please see https://k2-fsa.github.io/sherpa/social-groups.html From b1f08c0a203a84918bb4cd4526625dc5f6b7eac2 Mon Sep 17 00:00:00 2001 From: Lovemefan Date: Fri, 14 Jun 2024 16:51:53 +0800 Subject: [PATCH 018/237] scale value fix (#1006) --- scripts/tele-speech/test.py | 2 +- scripts/wenet/test-onnx-streaming.py | 2 +- scripts/wenet/test-onnx.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/tele-speech/test.py b/scripts/tele-speech/test.py index 71953700f..b52174d11 100755 --- a/scripts/tele-speech/test.py +++ b/scripts/tele-speech/test.py @@ -80,7 +80,7 @@ def get_features(test_wav_filename): samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000) sample_rate = 16000 - samples *= 372768 + samples *= 32768 opts = knf.MfccOptions() # See https://github.com/Tele-AI/TeleSpeech-ASR/blob/master/mfcc_hires.conf diff --git a/scripts/wenet/test-onnx-streaming.py b/scripts/wenet/test-onnx-streaming.py index 45a4b5b7d..cbf27b745 100755 --- a/scripts/wenet/test-onnx-streaming.py +++ b/scripts/wenet/test-onnx-streaming.py @@ -112,7 +112,7 @@ def get_features(test_wav_filename): audio = torchaudio.functional.resample( audio, orig_freq=sample_rate, new_freq=16000 ) - audio *= 372768 + audio *= 32768 opts = knf.FbankOptions() 
opts.frame_opts.dither = 0 diff --git a/scripts/wenet/test-onnx.py b/scripts/wenet/test-onnx.py index 988fef4b1..b9631ddb8 100755 --- a/scripts/wenet/test-onnx.py +++ b/scripts/wenet/test-onnx.py @@ -52,7 +52,7 @@ def get_features(test_wav_filename): audio = torchaudio.functional.resample( audio, orig_freq=sample_rate, new_freq=16000 ) - audio *= 372768 + audio *= 32768 opts = knf.FbankOptions() opts.frame_opts.dither = 0 From d94506698daba388a251fc653c6f2c486006db91 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 14 Jun 2024 18:40:16 +0800 Subject: [PATCH 019/237] Add non-streaming ASR examples for Dart API (#1007) --- .github/scripts/test-dart.sh | 43 ++++++++ .github/workflows/test-dart-package.yaml | 52 ++++++++++ .github/workflows/test-dart.yaml | 45 +++++++-- dart-api-examples/.gitignore | 1 + .../non-streaming-asr/.gitignore | 3 + .../non-streaming-asr/CHANGELOG.md | 3 + dart-api-examples/non-streaming-asr/README.md | 14 +++ .../non-streaming-asr/analysis_options.yaml | 30 ++++++ .../non-streaming-asr/bin/init.dart | 1 + .../non-streaming-asr/bin/nemo-ctc.dart | 52 ++++++++++ .../bin/nemo-transducer.dart | 62 ++++++++++++ .../non-streaming-asr/bin/paraformer.dart | 55 +++++++++++ .../non-streaming-asr/bin/telespeech-ctc.dart | 51 ++++++++++ .../bin/vad-with-paraformer.dart | 97 +++++++++++++++++++ .../non-streaming-asr/bin/whisper.dart | 59 +++++++++++ .../bin/zipformer-transducer.dart | 62 ++++++++++++ .../non-streaming-asr/pubspec.lock | 96 ++++++++++++++++++ .../non-streaming-asr/pubspec.yaml | 18 ++++ .../non-streaming-asr/run-nemo-ctc.sh | 17 ++++ .../non-streaming-asr/run-nemo-transducer.sh | 20 ++++ .../non-streaming-asr/run-paraformer.sh | 18 ++++ .../non-streaming-asr/run-telespeech-ctc.sh | 18 ++++ .../run-vad-with-paraformer.sh | 27 ++++++ .../non-streaming-asr/run-whisper.sh | 19 ++++ .../run-zipformer-transducer.sh | 20 ++++ dart-api-examples/vad/bin/init.dart | 29 ++++++ dart-api-examples/vad/bin/vad.dart | 31 ++---- 
dart-api-examples/vad/pubspec.lock | 4 +- dart-api-examples/vad/pubspec.yaml | 2 +- scripts/dart/non-streaming-asr-pubspec.yaml | 19 ++++ scripts/dart/vad-pubspec.yaml | 18 ++++ .../flutter/lib/src/offline_recognizer.dart | 14 ++- .../flutter/lib/src/online_recognizer.dart | 12 ++- .../flutter/lib/src/sherpa_onnx_bindings.dart | 7 ++ sherpa-onnx/flutter/pubspec.yaml | 2 +- 35 files changed, 984 insertions(+), 37 deletions(-) create mode 100755 .github/scripts/test-dart.sh create mode 100644 .github/workflows/test-dart-package.yaml create mode 100644 dart-api-examples/.gitignore create mode 100644 dart-api-examples/non-streaming-asr/.gitignore create mode 100644 dart-api-examples/non-streaming-asr/CHANGELOG.md create mode 100644 dart-api-examples/non-streaming-asr/README.md create mode 100644 dart-api-examples/non-streaming-asr/analysis_options.yaml create mode 120000 dart-api-examples/non-streaming-asr/bin/init.dart create mode 100644 dart-api-examples/non-streaming-asr/bin/nemo-ctc.dart create mode 100644 dart-api-examples/non-streaming-asr/bin/nemo-transducer.dart create mode 100644 dart-api-examples/non-streaming-asr/bin/paraformer.dart create mode 100644 dart-api-examples/non-streaming-asr/bin/telespeech-ctc.dart create mode 100644 dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart create mode 100644 dart-api-examples/non-streaming-asr/bin/whisper.dart create mode 100644 dart-api-examples/non-streaming-asr/bin/zipformer-transducer.dart create mode 100644 dart-api-examples/non-streaming-asr/pubspec.lock create mode 100644 dart-api-examples/non-streaming-asr/pubspec.yaml create mode 100755 dart-api-examples/non-streaming-asr/run-nemo-ctc.sh create mode 100755 dart-api-examples/non-streaming-asr/run-nemo-transducer.sh create mode 100755 dart-api-examples/non-streaming-asr/run-paraformer.sh create mode 100755 dart-api-examples/non-streaming-asr/run-telespeech-ctc.sh create mode 100755 dart-api-examples/non-streaming-asr/run-vad-with-paraformer.sh 
create mode 100755 dart-api-examples/non-streaming-asr/run-whisper.sh create mode 100755 dart-api-examples/non-streaming-asr/run-zipformer-transducer.sh create mode 100644 dart-api-examples/vad/bin/init.dart create mode 100644 scripts/dart/non-streaming-asr-pubspec.yaml create mode 100644 scripts/dart/vad-pubspec.yaml diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh new file mode 100755 index 000000000..35c0fa951 --- /dev/null +++ b/.github/scripts/test-dart.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +set -ex + +cd dart-api-examples + +pushd non-streaming-asr + +echo '----------VAD with paraformer----------' +./run-vad-with-paraformer.sh +rm -rf sherpa-onnx-* + +echo '----------NeMo transducer----------' +./run-nemo-transducer.sh +rm -rf sherpa-onnx-* + +echo '----------NeMo CTC----------' +./run-nemo-ctc.sh +rm -rf sherpa-onnx-* + +echo '----------TeleSpeech CTC----------' +./run-telespeech-ctc.sh +rm -rf sherpa-onnx-* + +echo '----------paraformer----------' +./run-paraformer.sh +rm -rf sherpa-onnx-* + +echo '----------whisper----------' +./run-whisper.sh +rm -rf sherpa-onnx-* + +echo '----------zipformer transducer----------' +./run-zipformer-transducer.sh +rm -rf sherpa-onnx-* + +popd + +pushd vad +./run.sh +rm *.onnx +popd + diff --git a/.github/workflows/test-dart-package.yaml b/.github/workflows/test-dart-package.yaml new file mode 100644 index 000000000..84556ff7d --- /dev/null +++ b/.github/workflows/test-dart-package.yaml @@ -0,0 +1,52 @@ +name: test-dart-package + +on: + schedule: + # minute (0-59) + # hour (0-23) + # day of the month (1-31) + # month (1-12) + # day of the week (0-6) + # nightly build at 15:50 UTC time every day + - cron: "50 15 * * *" + + workflow_dispatch: + +concurrency: + group: test-dart-package-${{ github.ref }} + cancel-in-progress: true + +jobs: + test_dart_package: + name: ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-latest, ubuntu-latest] #, 
windows-latest] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Flutter SDK + uses: flutter-actions/setup-flutter@v3 + with: + channel: stable + version: latest + + - name: Display flutter info + shell: bash + run: | + which flutter + which dart + + flutter --version + dart --version + flutter doctor + + - name: Run tests + shell: bash + run: | + .github/scripts/test-dart.sh diff --git a/.github/workflows/test-dart.yaml b/.github/workflows/test-dart.yaml index 0734df705..ae01373b3 100644 --- a/.github/workflows/test-dart.yaml +++ b/.github/workflows/test-dart.yaml @@ -21,19 +21,24 @@ concurrency: cancel-in-progress: true jobs: - dart: + test_dart: name: ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - os: [macos-latest, ubuntu-latest] #, windows-latest] + os: [ubuntu-latest] steps: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ matrix.os }}-dart + - name: Setup Flutter SDK uses: flutter-actions/setup-flutter@v3 with: @@ -50,11 +55,39 @@ jobs: dart --version flutter doctor + - name: Build sherpa-onnx + shell: bash + run: | + export CMAKE_CXX_COMPILER_LAUNCHER=ccache + export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" + cmake --version + mkdir build + + cd build + + cmake \ + -D BUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ + -DBUILD_ESPEAK_NG_EXE=OFF \ + -DSHERPA_ONNX_ENABLE_BINARY=OFF \ + -DCMAKE_INSTALL_PREFIX=./install \ + .. 
+ make -j install + + - name: Copy libs + shell: bash + run: | + cp -v build/install/lib/lib* ./sherpa-onnx/flutter/linux/ + + echo "--------------------" + + ls -lh ./sherpa-onnx/flutter/linux/ + - name: Run tests shell: bash run: | - cd dart-api-examples + cp scripts/dart/vad-pubspec.yaml dart-api-examples/vad/pubspec.yaml + cp scripts/dart/non-streaming-asr-pubspec.yaml dart-api-examples/non-streaming-asr/pubspec.yaml - pushd vad - ./run.sh - popd + .github/scripts/test-dart.sh diff --git a/dart-api-examples/.gitignore b/dart-api-examples/.gitignore new file mode 100644 index 000000000..248f032f1 --- /dev/null +++ b/dart-api-examples/.gitignore @@ -0,0 +1 @@ +!run*.sh diff --git a/dart-api-examples/non-streaming-asr/.gitignore b/dart-api-examples/non-streaming-asr/.gitignore new file mode 100644 index 000000000..3a8579040 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/.gitignore @@ -0,0 +1,3 @@ +# https://dart.dev/guides/libraries/private-files +# Created by `dart pub` +.dart_tool/ diff --git a/dart-api-examples/non-streaming-asr/CHANGELOG.md b/dart-api-examples/non-streaming-asr/CHANGELOG.md new file mode 100644 index 000000000..effe43c82 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/CHANGELOG.md @@ -0,0 +1,3 @@ +## 1.0.0 + +- Initial version. diff --git a/dart-api-examples/non-streaming-asr/README.md b/dart-api-examples/non-streaming-asr/README.md new file mode 100644 index 000000000..bfa21e4cd --- /dev/null +++ b/dart-api-examples/non-streaming-asr/README.md @@ -0,0 +1,14 @@ +# Introduction + +This folder contains examples for non-streaming ASR with Dart API. + +| File | Description| +|------|------------| +|[./bin/nemo-ctc.dart](./bin/nemo-ctc.dart)| Use a NeMo Ctc model for speech recognition. See [./run-nemo-ctc.sh](./run-nemo-ctc.sh)| +|[./bin/nemo-transducer.dart](./bin/nemo-transducer.dart)| Use a NeMo transducer model for speech recognition. 
See [./run-nemo-transducer.sh](./run-nemo-transducer.sh)| +|[./bin/paraformer.dart](./bin/paraformer.dart)|Use a paraformer model for speech recognition. See [./run-paraformer.sh](./run-paraformer.sh)| +|[./bin/telespeech-ctc.dart](./bin/telespeech-ctc.dart)| Use models from [Tele-AI/TeleSpeech-ASR](https://github.com/Tele-AI/TeleSpeech-ASR) for speech recognition. See [./run-telespeech-ctc.sh](./run-telespeech-ctc.sh)| +|[./bin/whisper.dart](./bin/whisper.dart)| Use whisper for speech recognition. See [./run-whisper.sh](./run-whisper.sh)| +|[./bin/zipformer-transducer.dart](./bin/zipformer-transducer.dart)| Use a zipformer transducer for speech recognition. See [./run-zipformer-transducer.sh](./run-zipformer-transducer.sh)| +|[./bin/vad-with-paraformer.dart](./bin/vad-with-paraformer.dart)| Use a [silero-vad](https://github.com/snakers4/silero-vad) with paraformer for speech recognition. See [./run-vad-with-paraformer.sh](./run-vad-with-paraformer.sh)| + diff --git a/dart-api-examples/non-streaming-asr/analysis_options.yaml b/dart-api-examples/non-streaming-asr/analysis_options.yaml new file mode 100644 index 000000000..dee8927aa --- /dev/null +++ b/dart-api-examples/non-streaming-asr/analysis_options.yaml @@ -0,0 +1,30 @@ +# This file configures the static analysis results for your project (errors, +# warnings, and lints). +# +# This enables the 'recommended' set of lints from `package:lints`. +# This set helps identify many issues that may lead to problems when running +# or consuming Dart code, and enforces writing Dart using a single, idiomatic +# style and format. +# +# If you want a smaller set of lints you can change this to specify +# 'package:lints/core.yaml'. These are just the most critical lints +# (the recommended set includes the core lints). +# The core lints are also what is used by pub.dev for scoring packages. + +include: package:lints/recommended.yaml + +# Uncomment the following section to specify additional rules. 
+ +# linter: +# rules: +# - camel_case_types + +# analyzer: +# exclude: +# - path/to/excluded/files/** + +# For more information about the core and recommended set of lints, see +# https://dart.dev/go/core-lints + +# For additional information about configuring this file, see +# https://dart.dev/guides/language/analysis-options diff --git a/dart-api-examples/non-streaming-asr/bin/init.dart b/dart-api-examples/non-streaming-asr/bin/init.dart new file mode 120000 index 000000000..48508cfd3 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/init.dart @@ -0,0 +1 @@ +../../vad/bin/init.dart \ No newline at end of file diff --git a/dart-api-examples/non-streaming-asr/bin/nemo-ctc.dart b/dart-api-examples/non-streaming-asr/bin/nemo-ctc.dart new file mode 100644 index 000000000..fa90635fd --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/nemo-ctc.dart @@ -0,0 +1,52 @@ +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the NeMo CTC model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['model'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final nemo = sherpa_onnx.OfflineNemoEncDecCtcModelConfig(model: model); + + final modelConfig = sherpa_onnx.OfflineModelConfig( + nemoCtc: nemo, + tokens: tokens, + debug: true, + numThreads: 1, + ); + final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final waveData = 
sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + print(result.text); + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/non-streaming-asr/bin/nemo-transducer.dart b/dart-api-examples/non-streaming-asr/bin/nemo-transducer.dart new file mode 100644 index 000000000..881487455 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/nemo-transducer.dart @@ -0,0 +1,62 @@ +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('encoder', help: 'Path to the encoder model') + ..addOption('decoder', help: 'Path to decoder model') + ..addOption('joiner', help: 'Path to joiner model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['encoder'] == null || + res['decoder'] == null || + res['joiner'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final encoder = res['encoder'] as String; + final decoder = res['decoder'] as String; + final joiner = res['joiner'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final transducer = sherpa_onnx.OfflineTransducerModelConfig( + encoder: encoder, + decoder: decoder, + joiner: joiner, + ); + + final modelConfig = sherpa_onnx.OfflineModelConfig( + transducer: transducer, + tokens: tokens, + debug: true, + numThreads: 1, + ); + final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final 
waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + print(result.text); + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/non-streaming-asr/bin/paraformer.dart b/dart-api-examples/non-streaming-asr/bin/paraformer.dart new file mode 100644 index 000000000..fd3886788 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/paraformer.dart @@ -0,0 +1,55 @@ +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the paraformer model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['model'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final paraformer = sherpa_onnx.OfflineParaformerModelConfig( + model: model, + ); + + final modelConfig = sherpa_onnx.OfflineModelConfig( + paraformer: paraformer, + tokens: tokens, + debug: true, + numThreads: 1, + modelType: 'paraformer', + ); + final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + 
print(result.text); + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/non-streaming-asr/bin/telespeech-ctc.dart b/dart-api-examples/non-streaming-asr/bin/telespeech-ctc.dart new file mode 100644 index 000000000..b9d21a0d0 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/telespeech-ctc.dart @@ -0,0 +1,51 @@ +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the telespeech CTC model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['model'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final modelConfig = sherpa_onnx.OfflineModelConfig( + telespeechCtc: model, + tokens: tokens, + debug: true, + numThreads: 1, + modelType: 'telespeech_ctc', + ); + final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + print(result.text); + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart b/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart new file mode 100644 index 000000000..1f03ed3e8 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart 
@@ -0,0 +1,97 @@ +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('silero-vad', help: 'Path to silero_vad.onnx') + ..addOption('model', help: 'Path to the paraformer model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['silero-vad'] == null || + res['model'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final sileroVad = res['silero-vad'] as String; + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final paraformer = sherpa_onnx.OfflineParaformerModelConfig( + model: model, + ); + + final modelConfig = sherpa_onnx.OfflineModelConfig( + paraformer: paraformer, + tokens: tokens, + debug: true, + numThreads: 1, + modelType: 'paraformer', + ); + final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final sileroVadConfig = sherpa_onnx.SileroVadModelConfig( + model: sileroVad, + minSilenceDuration: 0.25, + minSpeechDuration: 0.5, + ); + + final vadConfig = sherpa_onnx.VadModelConfig( + sileroVad: sileroVadConfig, + numThreads: 1, + debug: true, + ); + + final vad = sherpa_onnx.VoiceActivityDetector( + config: vadConfig, bufferSizeInSeconds: 10); + + final waveData = sherpa_onnx.readWave(inputWav); + + int numSamples = waveData.samples.length; + int numIter = numSamples ~/ vadConfig.sileroVad.windowSize; + + for (int i = 0; i != numIter; ++i) { + int start = i * vadConfig.sileroVad.windowSize; + vad.acceptWaveform(Float32List.sublistView( + waveData.samples, start, start + 
vadConfig.sileroVad.windowSize)); + + if (vad.isDetected()) { + while (!vad.isEmpty()) { + final stream = recognizer.createStream(); + final segment = vad.front(); + stream.acceptWaveform( + samples: segment.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + + final startTime = segment.start * 1.0 / waveData.sampleRate; + final duration = segment.samples.length * 1.0 / waveData.sampleRate; + final stopTime = startTime + duration; + if (result.text != '') { + print( + '${startTime.toStringAsPrecision(4)} -- ${stopTime.toStringAsPrecision(4)}: ${result.text}'); + } + + stream.free(); + vad.pop(); + } + } + } + + vad.free(); + recognizer.free(); +} diff --git a/dart-api-examples/non-streaming-asr/bin/whisper.dart b/dart-api-examples/non-streaming-asr/bin/whisper.dart new file mode 100644 index 000000000..0e4e0f3d7 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/whisper.dart @@ -0,0 +1,59 @@ +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('encoder', help: 'Path to the whisper encoder model') + ..addOption('decoder', help: 'Path to whisper decoder model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['encoder'] == null || + res['decoder'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final encoder = res['encoder'] as String; + final decoder = res['decoder'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final whisper = sherpa_onnx.OfflineWhisperModelConfig( + encoder: encoder, + decoder: decoder, + ); + + final 
modelConfig = sherpa_onnx.OfflineModelConfig( + whisper: whisper, + tokens: tokens, + modelType: 'whisper', + debug: false, + numThreads: 1, + ); + final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + print(result.text); + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/non-streaming-asr/bin/zipformer-transducer.dart b/dart-api-examples/non-streaming-asr/bin/zipformer-transducer.dart new file mode 100644 index 000000000..881487455 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/zipformer-transducer.dart @@ -0,0 +1,62 @@ +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('encoder', help: 'Path to the encoder model') + ..addOption('decoder', help: 'Path to decoder model') + ..addOption('joiner', help: 'Path to joiner model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['encoder'] == null || + res['decoder'] == null || + res['joiner'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final encoder = res['encoder'] as String; + final decoder = res['decoder'] as String; + final joiner = res['joiner'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final transducer = sherpa_onnx.OfflineTransducerModelConfig( + encoder: encoder, + 
decoder: decoder, + joiner: joiner, + ); + + final modelConfig = sherpa_onnx.OfflineModelConfig( + transducer: transducer, + tokens: tokens, + debug: true, + numThreads: 1, + ); + final config = sherpa_onnx.OfflineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + print(result.text); + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/non-streaming-asr/pubspec.lock b/dart-api-examples/non-streaming-asr/pubspec.lock new file mode 100644 index 000000000..7c77c2a62 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/pubspec.lock @@ -0,0 +1,96 @@ +# Generated by pub +# See https://dart.dev/tools/pub/glossary#lockfile +packages: + args: + dependency: "direct main" + description: + name: args + sha256: "7cf60b9f0cc88203c5a190b4cd62a99feea42759a7fa695010eb5de1c0b2252a" + url: "https://pub.dev" + source: hosted + version: "2.5.0" + characters: + dependency: transitive + description: + name: characters + sha256: "04a925763edad70e8443c99234dc3328f442e811f1d8fd1a72f1c8ad0f69a605" + url: "https://pub.dev" + source: hosted + version: "1.3.0" + collection: + dependency: transitive + description: + name: collection + sha256: ee67cb0715911d28db6bf4af1026078bd6f0128b07a5f66fb2ed94ec6783c09a + url: "https://pub.dev" + source: hosted + version: "1.18.0" + ffi: + dependency: transitive + description: + name: ffi + sha256: "493f37e7df1804778ff3a53bd691d8692ddf69702cf4c1c1096a2e41b4779e21" + url: "https://pub.dev" + source: hosted + version: "2.1.2" + flutter: + dependency: transitive + description: flutter + source: sdk + version: "0.0.0" + lints: + dependency: "direct dev" + description: + name: lints + sha256: 
cbf8d4b858bb0134ef3ef87841abdf8d63bfc255c266b7bf6b39daa1085c4290 + url: "https://pub.dev" + source: hosted + version: "3.0.0" + material_color_utilities: + dependency: transitive + description: + name: material_color_utilities + sha256: "0e0a020085b65b6083975e499759762399b4475f766c21668c4ecca34ea74e5a" + url: "https://pub.dev" + source: hosted + version: "0.8.0" + meta: + dependency: transitive + description: + name: meta + sha256: "7687075e408b093f36e6bbf6c91878cc0d4cd10f409506f7bc996f68220b9136" + url: "https://pub.dev" + source: hosted + version: "1.12.0" + path: + dependency: "direct main" + description: + name: path + sha256: "087ce49c3f0dc39180befefc60fdb4acd8f8620e5682fe2476afd0b3688bb4af" + url: "https://pub.dev" + source: hosted + version: "1.9.0" + sherpa_onnx: + dependency: "direct main" + description: + name: sherpa_onnx + sha256: e45894f81e7c854ca96d678bcab5303036e884a7c90e9a6c4ec04c7b1ee215a8 + url: "https://pub.dev" + source: hosted + version: "1.9.29" + sky_engine: + dependency: transitive + description: flutter + source: sdk + version: "0.0.99" + vector_math: + dependency: transitive + description: + name: vector_math + sha256: "80b3257d1492ce4d091729e3a67a60407d227c27241d6927be0130c98e741803" + url: "https://pub.dev" + source: hosted + version: "2.1.4" +sdks: + dart: ">=3.4.0 <4.0.0" + flutter: ">=3.3.0" diff --git a/dart-api-examples/non-streaming-asr/pubspec.yaml b/dart-api-examples/non-streaming-asr/pubspec.yaml new file mode 100644 index 000000000..61dbe71f2 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/pubspec.yaml @@ -0,0 +1,18 @@ +name: non_streaming_asr +description: > + This example demonstrates how to use the Dart API for Non-streaming speech recognition. Specifically, we use the following models as examples, whisper, zipformer, and paraformer. + +version: 1.0.0 +# repository: https://github.com/my_org/my_repo + +environment: + sdk: ^3.4.0 + +# Add regular dependencies here. 
+dependencies: + sherpa_onnx: ^1.9.29 + path: ^1.9.0 + args: ^2.5.0 + +dev_dependencies: + lints: ^3.0.0 diff --git a/dart-api-examples/non-streaming-asr/run-nemo-ctc.sh b/dart-api-examples/non-streaming-asr/run-nemo-ctc.sh new file mode 100755 index 000000000..74775c0e6 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/run-nemo-ctc.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 + tar xvf sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 + rm sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 +fi + +dart run \ + ./bin/nemo-ctc.dart \ + --model ./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx \ + --tokens ./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt \ + --input-wav ./sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/de-german.wav diff --git a/dart-api-examples/non-streaming-asr/run-nemo-transducer.sh b/dart-api-examples/non-streaming-asr/run-nemo-transducer.sh new file mode 100755 index 000000000..5f4854df3 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/run-nemo-transducer.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! 
-f ./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 + + tar xvf sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 + rm sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k.tar.bz2 +fi + +dart run \ + ./bin/nemo-transducer.dart \ + --encoder ./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx \ + --decoder ./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx \ + --joiner ./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx \ + --tokens ./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt \ + --input-wav ./sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/test_wavs/de-german.wav diff --git a/dart-api-examples/non-streaming-asr/run-paraformer.sh b/dart-api-examples/non-streaming-asr/run-paraformer.sh new file mode 100755 index 000000000..1e1f9c82a --- /dev/null +++ b/dart-api-examples/non-streaming-asr/run-paraformer.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! 
-f ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + + tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +fi + +dart run \ + ./bin/paraformer.dart \ + --model ./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx \ + --tokens ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --input-wav ./sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/3-sichuan.wav diff --git a/dart-api-examples/non-streaming-asr/run-telespeech-ctc.sh b/dart-api-examples/non-streaming-asr/run-telespeech-ctc.sh new file mode 100755 index 000000000..8f9023924 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/run-telespeech-ctc.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + + tar xvf sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 + rm sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04.tar.bz2 +fi + +dart run \ + ./bin/telespeech-ctc.dart \ + --model ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx \ + --tokens ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt \ + --input-wav ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav diff --git a/dart-api-examples/non-streaming-asr/run-vad-with-paraformer.sh b/dart-api-examples/non-streaming-asr/run-vad-with-paraformer.sh new file mode 100755 index 000000000..0a1670b2b --- /dev/null +++ b/dart-api-examples/non-streaming-asr/run-vad-with-paraformer.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [[ ! -f ./silero_vad.onnx ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +fi + +if [[ ! 
-f ./lei-jun-test.wav ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +fi + +if [ ! -f ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + + tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +fi + +dart run \ + ./bin/vad-with-paraformer.dart \ + --silero-vad ./silero_vad.onnx \ + --model ./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx \ + --tokens ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --input-wav ./lei-jun-test.wav diff --git a/dart-api-examples/non-streaming-asr/run-whisper.sh b/dart-api-examples/non-streaming-asr/run-whisper.sh new file mode 100755 index 000000000..739b54372 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/run-whisper.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 + + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 + rm sherpa-onnx-whisper-tiny.en.tar.bz2 +fi + +dart run \ + ./bin/whisper.dart \ + --encoder ./sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx \ + --decoder ./sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx \ + --tokens ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt \ + --input-wav ./sherpa-onnx-whisper-tiny.en/test_wavs/0.wav diff --git a/dart-api-examples/non-streaming-asr/run-zipformer-transducer.sh b/dart-api-examples/non-streaming-asr/run-zipformer-transducer.sh new file mode 100755 index 000000000..9f115c62d --- /dev/null +++ b/dart-api-examples/non-streaming-asr/run-zipformer-transducer.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! 
-f ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-gigaspeech-2023-12-12.tar.bz2 + + tar xvf sherpa-onnx-zipformer-gigaspeech-2023-12-12.tar.bz2 + rm sherpa-onnx-zipformer-gigaspeech-2023-12-12.tar.bz2 +fi + +dart run \ + ./bin/zipformer-transducer.dart \ + --encoder ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx \ + --decoder ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx \ + --joiner ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.int8.onnx \ + --tokens ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/tokens.txt \ + --input-wav ./sherpa-onnx-zipformer-gigaspeech-2023-12-12/test_wavs/1221-135766-0001.wav diff --git a/dart-api-examples/vad/bin/init.dart b/dart-api-examples/vad/bin/init.dart new file mode 100644 index 000000000..072ff29b8 --- /dev/null +++ b/dart-api-examples/vad/bin/init.dart @@ -0,0 +1,29 @@ +import 'dart:io'; +import 'dart:isolate'; +import 'package:path/path.dart' as p; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +Future initSherpaOnnx() async { + var uri = await Isolate.resolvePackageUri( + Uri.parse('package:sherpa_onnx/sherpa_onnx.dart')); + + if (uri == null) { + print('File not found'); + exit(1); + } + + String platform = ''; + + if (Platform.isMacOS) { + platform = 'macos'; + } else if (Platform.isLinux) { + platform = 'linux'; + } else if (Platform.isWindows) { + platform = 'windows'; + } else { + throw UnsupportedError('Unknown platform: ${Platform.operatingSystem}'); + } + + final libPath = p.join(p.dirname(p.fromUri(uri)), '..', platform); + sherpa_onnx.initBindings(libPath); +} diff --git a/dart-api-examples/vad/bin/vad.dart b/dart-api-examples/vad/bin/vad.dart index d981bad94..3ef9f66da 100644 --- a/dart-api-examples/vad/bin/vad.dart +++ b/dart-api-examples/vad/bin/vad.dart @@ -1,33 +1,9 @@ import 'dart:io'; -import 
'dart:isolate'; import 'dart:typed_data'; import 'package:args/args.dart'; -import 'package:path/path.dart' as p; import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; - -Future initSherpaOnnx() async { - var uri = await Isolate.resolvePackageUri( - Uri.parse('package:sherpa_onnx/sherpa_onnx.dart')); - - if (uri == null) { - print('File not found'); - exit(1); - } - String platform = ''; - if (Platform.isMacOS) { - platform = 'macos'; - } else if (Platform.isLinux) { - platform = 'linux'; - } else if (Platform.isWindows) { - platform = 'windows'; - } else { - throw UnsupportedError('Unknown platform: ${Platform.operatingSystem}'); - } - - final libPath = p.join(p.dirname(p.fromUri(uri)), '..', platform); - sherpa_onnx.initBindings(libPath); -} +import './init.dart'; void main(List arguments) async { await initSherpaOnnx(); @@ -36,6 +12,7 @@ void main(List arguments) async { ..addOption('silero-vad', help: 'Path to silero_vad.onnx') ..addOption('input-wav', help: 'Path to input.wav') ..addOption('output-wav', help: 'Path to output.wav'); + final res = parser.parse(arguments); if (res['silero-vad'] == null || res['input-wav'] == null || @@ -53,6 +30,7 @@ void main(List arguments) async { minSilenceDuration: 0.25, minSpeechDuration: 0.5, ); + final config = sherpa_onnx.VadModelConfig( sileroVad: sileroVadConfig, numThreads: 1, @@ -86,8 +64,11 @@ void main(List arguments) async { } } + vad.free(); + final s = Float32List.fromList(allSamples.expand((x) => x).toList()); sherpa_onnx.writeWave( filename: outputWav, samples: s, sampleRate: waveData.sampleRate); + print('Saved to ${outputWav}'); } diff --git a/dart-api-examples/vad/pubspec.lock b/dart-api-examples/vad/pubspec.lock index a29b073e6..7c77c2a62 100644 --- a/dart-api-examples/vad/pubspec.lock +++ b/dart-api-examples/vad/pubspec.lock @@ -74,10 +74,10 @@ packages: dependency: "direct main" description: name: sherpa_onnx - sha256: "6cfadf7bc35001bb1284f9fac1e03e33787cafa918e0c45da96d1e91afa58751" + sha256: 
e45894f81e7c854ca96d678bcab5303036e884a7c90e9a6c4ec04c7b1ee215a8 url: "https://pub.dev" source: hosted - version: "0.0.3" + version: "1.9.29" sky_engine: dependency: transitive description: flutter diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml index e7c7bc1cc..54c13e0fa 100644 --- a/dart-api-examples/vad/pubspec.yaml +++ b/dart-api-examples/vad/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^0.0.3 + sherpa_onnx: ^1.9.29 path: ^1.9.0 args: ^2.5.0 diff --git a/scripts/dart/non-streaming-asr-pubspec.yaml b/scripts/dart/non-streaming-asr-pubspec.yaml new file mode 100644 index 000000000..8d389f38f --- /dev/null +++ b/scripts/dart/non-streaming-asr-pubspec.yaml @@ -0,0 +1,19 @@ +name: non_streaming_asr +description: > + This example demonstrates how to use the Dart API for Non-streaming speech recognition. Specifically, we use the following models as examples, whisper, zipformer, and paraformer. + +version: 1.0.0 + +environment: + sdk: ^3.4.0 + +# Add regular dependencies here. +dependencies: + sherpa_onnx: + path: ../../sherpa-onnx/flutter + + path: ^1.9.0 + args: ^2.5.0 + +dev_dependencies: + lints: ^3.0.0 diff --git a/scripts/dart/vad-pubspec.yaml b/scripts/dart/vad-pubspec.yaml new file mode 100644 index 000000000..2d9758df1 --- /dev/null +++ b/scripts/dart/vad-pubspec.yaml @@ -0,0 +1,18 @@ +name: vad + +description: > + This example demonstrates how to use the Dart API for VAD (voice activity detection). 
+ +version: 1.0.0 + +environment: + sdk: ^3.4.0 + +dependencies: + sherpa_onnx: + path: ../../sherpa-onnx/flutter + path: ^1.9.0 + args: ^2.5.0 + +dev_dependencies: + lints: ^3.0.0 diff --git a/sherpa-onnx/flutter/lib/src/offline_recognizer.dart b/sherpa-onnx/flutter/lib/src/offline_recognizer.dart index 633312424..7a220105c 100644 --- a/sherpa-onnx/flutter/lib/src/offline_recognizer.dart +++ b/sherpa-onnx/flutter/lib/src/offline_recognizer.dart @@ -102,11 +102,14 @@ class OfflineModelConfig { this.debug = true, this.provider = 'cpu', this.modelType = '', + this.modelingUnit = '', + this.bpeVocab = '', + this.telespeechCtc = '', }); @override String toString() { - return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType)'; + return 'OfflineModelConfig(transducer: $transducer, paraformer: $paraformer, nemoCtc: $nemoCtc, whisper: $whisper, tdnn: $tdnn, tokens: $tokens, numThreads: $numThreads, debug: $debug, provider: $provider, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab, telespeechCtc: $telespeechCtc)'; } final OfflineTransducerModelConfig transducer; @@ -120,6 +123,9 @@ class OfflineModelConfig { final bool debug; final String provider; final String modelType; + final String modelingUnit; + final String bpeVocab; + final String telespeechCtc; } class OfflineRecognizerConfig { @@ -213,6 +219,9 @@ class OfflineRecognizer { c.ref.model.debug = config.model.debug ? 
1 : 0; c.ref.model.provider = config.model.provider.toNativeUtf8(); c.ref.model.modelType = config.model.modelType.toNativeUtf8(); + c.ref.model.modelingUnit = config.model.modelingUnit.toNativeUtf8(); + c.ref.model.bpeVocab = config.model.bpeVocab.toNativeUtf8(); + c.ref.model.telespeechCtc = config.model.telespeechCtc.toNativeUtf8(); c.ref.lm.model = config.lm.model.toNativeUtf8(); c.ref.lm.scale = config.lm.scale; @@ -228,6 +237,9 @@ class OfflineRecognizer { calloc.free(c.ref.hotwordsFile); calloc.free(c.ref.decodingMethod); calloc.free(c.ref.lm.model); + calloc.free(c.ref.model.telespeechCtc); + calloc.free(c.ref.model.bpeVocab); + calloc.free(c.ref.model.modelingUnit); calloc.free(c.ref.model.modelType); calloc.free(c.ref.model.provider); calloc.free(c.ref.model.tokens); diff --git a/sherpa-onnx/flutter/lib/src/online_recognizer.dart b/sherpa-onnx/flutter/lib/src/online_recognizer.dart index 538c68dda..bee1f2683 100644 --- a/sherpa-onnx/flutter/lib/src/online_recognizer.dart +++ b/sherpa-onnx/flutter/lib/src/online_recognizer.dart @@ -58,11 +58,13 @@ class OnlineModelConfig { this.provider = 'cpu', this.debug = true, this.modelType = '', + this.modelingUnit = '', + this.bpeVocab = '', }); @override String toString() { - return 'OnlineModelConfig(transducer: $transducer, paraformer: $paraformer, zipformer2Ctc: $zipformer2Ctc, tokens: $tokens, numThreads: $numThreads, provider: $provider, debug: $debug, modelType: $modelType)'; + return 'OnlineModelConfig(transducer: $transducer, paraformer: $paraformer, zipformer2Ctc: $zipformer2Ctc, tokens: $tokens, numThreads: $numThreads, provider: $provider, debug: $debug, modelType: $modelType, modelingUnit: $modelingUnit, bpeVocab: $bpeVocab)'; } final OnlineTransducerModelConfig transducer; @@ -78,6 +80,10 @@ class OnlineModelConfig { final bool debug; final String modelType; + + final String modelingUnit; + + final String bpeVocab; } class OnlineCtcFstDecoderConfig { @@ -180,6 +186,8 @@ class OnlineRecognizer { 
c.ref.model.provider = config.model.provider.toNativeUtf8(); c.ref.model.debug = config.model.debug ? 1 : 0; c.ref.model.modelType = config.model.modelType.toNativeUtf8(); + c.ref.model.modelingUnit = config.model.modelingUnit.toNativeUtf8(); + c.ref.model.bpeVocab = config.model.bpeVocab.toNativeUtf8(); c.ref.decodingMethod = config.decodingMethod.toNativeUtf8(); c.ref.maxActivePaths = config.maxActivePaths; @@ -199,6 +207,8 @@ class OnlineRecognizer { calloc.free(c.ref.ctcFstDecoderConfig.graph); calloc.free(c.ref.hotwordsFile); calloc.free(c.ref.decodingMethod); + calloc.free(c.ref.model.bpeVocab); + calloc.free(c.ref.model.modelingUnit); calloc.free(c.ref.model.modelType); calloc.free(c.ref.model.provider); calloc.free(c.ref.model.tokens); diff --git a/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart b/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart index 997bfc70e..efc96a8f7 100644 --- a/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart +++ b/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart @@ -63,6 +63,9 @@ final class SherpaOnnxOfflineModelConfig extends Struct { external Pointer provider; external Pointer modelType; + external Pointer modelingUnit; + external Pointer bpeVocab; + external Pointer telespeechCtc; } final class SherpaOnnxOfflineRecognizerConfig extends Struct { @@ -111,6 +114,10 @@ final class SherpaOnnxOnlineModelConfig extends Struct { external int debug; external Pointer modelType; + + external Pointer modelingUnit; + + external Pointer bpeVocab; } final class SherpaOnnxOnlineCtcFstDecoderConfig extends Struct { diff --git a/sherpa-onnx/flutter/pubspec.yaml b/sherpa-onnx/flutter/pubspec.yaml index 3b59ef479..2944a9a27 100644 --- a/sherpa-onnx/flutter/pubspec.yaml +++ b/sherpa-onnx/flutter/pubspec.yaml @@ -17,7 +17,7 @@ topics: - voice-activity-detection # remember to change the version in macos/sherpa_onnx.podspec -version: 0.0.2 +version: 1.9.29 homepage: https://github.com/k2-fsa/sherpa-onnx From 
e3077670c66263ee541e6b49f686e5fdf0a6005c Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 15 Jun 2024 11:48:54 +0800 Subject: [PATCH 020/237] Add streaming ASR examples for Dart API (#1009) --- .github/scripts/test-dart.sh | 26 +- .github/workflows/test-dart.yaml | 3 + .../non-streaming-asr/bin/nemo-ctc.dart | 1 + .../bin/nemo-transducer.dart | 1 + .../non-streaming-asr/bin/paraformer.dart | 1 + .../non-streaming-asr/bin/telespeech-ctc.dart | 1 + .../bin/vad-with-paraformer.dart | 1 + .../non-streaming-asr/bin/whisper.dart | 1 + .../bin/zipformer-transducer.dart | 1 + dart-api-examples/streaming-asr/.gitignore | 3 + dart-api-examples/streaming-asr/CHANGELOG.md | 3 + dart-api-examples/streaming-asr/README.md | 11 + .../streaming-asr/analysis_options.yaml | 30 ++ dart-api-examples/streaming-asr/bin/init.dart | 1 + .../streaming-asr/bin/nemo-transducer.dart | 1 + .../streaming-asr/bin/paraformer.dart | 92 ++++ .../streaming-asr/bin/zipformer-ctc-hlg.dart | 94 ++++ .../streaming-asr/bin/zipformer-ctc.dart | 88 ++++ .../bin/zipformer-transducer.dart | 96 ++++ dart-api-examples/streaming-asr/pubspec.lock | 432 ++++++++++++++++++ dart-api-examples/streaming-asr/pubspec.yaml | 20 + .../streaming-asr/run-nemo-transducer.sh | 19 + .../streaming-asr/run-paraformer.sh | 18 + .../streaming-asr/run-zipformer-ctc-hlg.sh | 18 + .../streaming-asr/run-zipformer-ctc.sh | 17 + .../streaming-asr/run-zipformer-transducer.sh | 19 + dart-api-examples/vad/bin/init.dart | 1 + dart-api-examples/vad/bin/vad.dart | 1 + nodejs-addon-examples/package.json | 2 +- scripts/dart/streaming-asr-pubspec.yaml | 21 + 30 files changed, 1021 insertions(+), 2 deletions(-) create mode 100644 dart-api-examples/streaming-asr/.gitignore create mode 100644 dart-api-examples/streaming-asr/CHANGELOG.md create mode 100644 dart-api-examples/streaming-asr/README.md create mode 100644 dart-api-examples/streaming-asr/analysis_options.yaml create mode 120000 dart-api-examples/streaming-asr/bin/init.dart create 
mode 120000 dart-api-examples/streaming-asr/bin/nemo-transducer.dart create mode 100644 dart-api-examples/streaming-asr/bin/paraformer.dart create mode 100644 dart-api-examples/streaming-asr/bin/zipformer-ctc-hlg.dart create mode 100644 dart-api-examples/streaming-asr/bin/zipformer-ctc.dart create mode 100644 dart-api-examples/streaming-asr/bin/zipformer-transducer.dart create mode 100644 dart-api-examples/streaming-asr/pubspec.lock create mode 100644 dart-api-examples/streaming-asr/pubspec.yaml create mode 100755 dart-api-examples/streaming-asr/run-nemo-transducer.sh create mode 100755 dart-api-examples/streaming-asr/run-paraformer.sh create mode 100755 dart-api-examples/streaming-asr/run-zipformer-ctc-hlg.sh create mode 100755 dart-api-examples/streaming-asr/run-zipformer-ctc.sh create mode 100755 dart-api-examples/streaming-asr/run-zipformer-transducer.sh create mode 100644 scripts/dart/streaming-asr-pubspec.yaml diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index 35c0fa951..0599864c0 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -4,6 +4,30 @@ set -ex cd dart-api-examples +pushd streaming-asr + +echo '----------streaming zipformer ctc HLG----------' +./run-zipformer-ctc-hlg.sh +rm -rf sherpa-onnx-* + +echo '----------streaming zipformer ctc----------' +./run-zipformer-ctc.sh +rm -rf sherpa-onnx-* + +echo '----------streaming zipformer transducer----------' +./run-zipformer-transducer.sh +rm -rf sherpa-onnx-* + +echo '----------streaming NeMo transducer----------' +./run-nemo-transducer.sh +rm -rf sherpa-onnx-* + +echo '----------streaming paraformer----------' +./run-paraformer.sh +rm -rf sherpa-onnx-* + +popd # streaming-asr + pushd non-streaming-asr echo '----------VAD with paraformer----------' @@ -34,7 +58,7 @@ echo '----------zipformer transducer----------' ./run-zipformer-transducer.sh rm -rf sherpa-onnx-* -popd +popd # non-streaming-asr pushd vad ./run.sh diff --git 
a/.github/workflows/test-dart.yaml b/.github/workflows/test-dart.yaml index ae01373b3..c1605a382 100644 --- a/.github/workflows/test-dart.yaml +++ b/.github/workflows/test-dart.yaml @@ -6,12 +6,14 @@ on: - master paths: - '.github/workflows/test-dart.yaml' + - '.github/scripts/test-dart.sh' - 'dart-api-examples/**' pull_request: branches: - master paths: - '.github/workflows/test-dart.yaml' + - '.github/scripts/test-dart.sh' - 'dart-api-examples/**' workflow_dispatch: @@ -89,5 +91,6 @@ jobs: run: | cp scripts/dart/vad-pubspec.yaml dart-api-examples/vad/pubspec.yaml cp scripts/dart/non-streaming-asr-pubspec.yaml dart-api-examples/non-streaming-asr/pubspec.yaml + cp scripts/dart/streaming-asr-pubspec.yaml dart-api-examples/streaming-asr/pubspec.yaml .github/scripts/test-dart.sh diff --git a/dart-api-examples/non-streaming-asr/bin/nemo-ctc.dart b/dart-api-examples/non-streaming-asr/bin/nemo-ctc.dart index fa90635fd..2565862bb 100644 --- a/dart-api-examples/non-streaming-asr/bin/nemo-ctc.dart +++ b/dart-api-examples/non-streaming-asr/bin/nemo-ctc.dart @@ -1,3 +1,4 @@ +// Copyright (c) 2024 Xiaomi Corporation import 'dart:io'; import 'dart:typed_data'; diff --git a/dart-api-examples/non-streaming-asr/bin/nemo-transducer.dart b/dart-api-examples/non-streaming-asr/bin/nemo-transducer.dart index 881487455..3df8095c6 100644 --- a/dart-api-examples/non-streaming-asr/bin/nemo-transducer.dart +++ b/dart-api-examples/non-streaming-asr/bin/nemo-transducer.dart @@ -1,3 +1,4 @@ +// Copyright (c) 2024 Xiaomi Corporation import 'dart:io'; import 'dart:typed_data'; diff --git a/dart-api-examples/non-streaming-asr/bin/paraformer.dart b/dart-api-examples/non-streaming-asr/bin/paraformer.dart index fd3886788..15f45a1c5 100644 --- a/dart-api-examples/non-streaming-asr/bin/paraformer.dart +++ b/dart-api-examples/non-streaming-asr/bin/paraformer.dart @@ -1,3 +1,4 @@ +// Copyright (c) 2024 Xiaomi Corporation import 'dart:io'; import 'dart:typed_data'; diff --git 
a/dart-api-examples/non-streaming-asr/bin/telespeech-ctc.dart b/dart-api-examples/non-streaming-asr/bin/telespeech-ctc.dart index b9d21a0d0..633baabef 100644 --- a/dart-api-examples/non-streaming-asr/bin/telespeech-ctc.dart +++ b/dart-api-examples/non-streaming-asr/bin/telespeech-ctc.dart @@ -1,3 +1,4 @@ +// Copyright (c) 2024 Xiaomi Corporation import 'dart:io'; import 'dart:typed_data'; diff --git a/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart b/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart index 1f03ed3e8..5e7cfb485 100644 --- a/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart +++ b/dart-api-examples/non-streaming-asr/bin/vad-with-paraformer.dart @@ -1,3 +1,4 @@ +// Copyright (c) 2024 Xiaomi Corporation import 'dart:io'; import 'dart:typed_data'; diff --git a/dart-api-examples/non-streaming-asr/bin/whisper.dart b/dart-api-examples/non-streaming-asr/bin/whisper.dart index 0e4e0f3d7..1fffcd835 100644 --- a/dart-api-examples/non-streaming-asr/bin/whisper.dart +++ b/dart-api-examples/non-streaming-asr/bin/whisper.dart @@ -1,3 +1,4 @@ +// Copyright (c) 2024 Xiaomi Corporation import 'dart:io'; import 'dart:typed_data'; diff --git a/dart-api-examples/non-streaming-asr/bin/zipformer-transducer.dart b/dart-api-examples/non-streaming-asr/bin/zipformer-transducer.dart index 881487455..3df8095c6 100644 --- a/dart-api-examples/non-streaming-asr/bin/zipformer-transducer.dart +++ b/dart-api-examples/non-streaming-asr/bin/zipformer-transducer.dart @@ -1,3 +1,4 @@ +// Copyright (c) 2024 Xiaomi Corporation import 'dart:io'; import 'dart:typed_data'; diff --git a/dart-api-examples/streaming-asr/.gitignore b/dart-api-examples/streaming-asr/.gitignore new file mode 100644 index 000000000..3a8579040 --- /dev/null +++ b/dart-api-examples/streaming-asr/.gitignore @@ -0,0 +1,3 @@ +# https://dart.dev/guides/libraries/private-files +# Created by `dart pub` +.dart_tool/ diff --git a/dart-api-examples/streaming-asr/CHANGELOG.md 
b/dart-api-examples/streaming-asr/CHANGELOG.md new file mode 100644 index 000000000..effe43c82 --- /dev/null +++ b/dart-api-examples/streaming-asr/CHANGELOG.md @@ -0,0 +1,3 @@ +## 1.0.0 + +- Initial version. diff --git a/dart-api-examples/streaming-asr/README.md b/dart-api-examples/streaming-asr/README.md new file mode 100644 index 000000000..357b5f154 --- /dev/null +++ b/dart-api-examples/streaming-asr/README.md @@ -0,0 +1,11 @@ +# Introduction + +This folder contains examples for streaming ASR with Dart API. +| File | Description| +|------|------------| +|[./bin/nemo-transducer.dart](./bin/nemo-transducer.dart)| Use a NeMo transducer model for speech recognition. See [./run-nemo-transducer.sh](./run-nemo-transducer.sh)| +|[./bin/paraformer.dart](./bin/paraformer.dart)| Use a Paraformer model for speech recognition. See [./run-paraformer.sh](./run-paraformer.sh)| +|[./bin/zipformer-ctc-hlg.dart](./bin/zipformer-ctc-hlg.dart)| Use a Zipformer CTC model with HLG graph for speech recognition. See [./run-zipformer-ctc-hlg.sh](./run-zipformer-ctc-hlg.sh)| +|[./bin/zipformer-ctc.dart](./bin/zipformer-ctc.dart)| Use a Zipformer CTC model for speech recognition. See [./run-zipformer-ctc.sh](./run-zipformer-ctc.sh)| +|[./bin/zipformer-transducer.dart](./bin/zipformer-transducer.dart)| Use a Zipformer transducer model for speech recognition. See [./run-zipformer-transducer.sh](./run-zipformer-transducer.sh)| + diff --git a/dart-api-examples/streaming-asr/analysis_options.yaml b/dart-api-examples/streaming-asr/analysis_options.yaml new file mode 100644 index 000000000..dee8927aa --- /dev/null +++ b/dart-api-examples/streaming-asr/analysis_options.yaml @@ -0,0 +1,30 @@ +# This file configures the static analysis results for your project (errors, +# warnings, and lints). +# +# This enables the 'recommended' set of lints from `package:lints`. 
+# This set helps identify many issues that may lead to problems when running +# or consuming Dart code, and enforces writing Dart using a single, idiomatic +# style and format. +# +# If you want a smaller set of lints you can change this to specify +# 'package:lints/core.yaml'. These are just the most critical lints +# (the recommended set includes the core lints). +# The core lints are also what is used by pub.dev for scoring packages. + +include: package:lints/recommended.yaml + +# Uncomment the following section to specify additional rules. + +# linter: +# rules: +# - camel_case_types + +# analyzer: +# exclude: +# - path/to/excluded/files/** + +# For more information about the core and recommended set of lints, see +# https://dart.dev/go/core-lints + +# For additional information about configuring this file, see +# https://dart.dev/guides/language/analysis-options diff --git a/dart-api-examples/streaming-asr/bin/init.dart b/dart-api-examples/streaming-asr/bin/init.dart new file mode 120000 index 000000000..48508cfd3 --- /dev/null +++ b/dart-api-examples/streaming-asr/bin/init.dart @@ -0,0 +1 @@ +../../vad/bin/init.dart \ No newline at end of file diff --git a/dart-api-examples/streaming-asr/bin/nemo-transducer.dart b/dart-api-examples/streaming-asr/bin/nemo-transducer.dart new file mode 120000 index 000000000..6e2a04343 --- /dev/null +++ b/dart-api-examples/streaming-asr/bin/nemo-transducer.dart @@ -0,0 +1 @@ +zipformer-transducer.dart \ No newline at end of file diff --git a/dart-api-examples/streaming-asr/bin/paraformer.dart b/dart-api-examples/streaming-asr/bin/paraformer.dart new file mode 100644 index 000000000..5781407df --- /dev/null +++ b/dart-api-examples/streaming-asr/bin/paraformer.dart @@ -0,0 +1,92 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + 
await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('encoder', help: 'Path to the encoder model') + ..addOption('decoder', help: 'Path to decoder model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['encoder'] == null || + res['decoder'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final encoder = res['encoder'] as String; + final decoder = res['decoder'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final paraformer = sherpa_onnx.OnlineParaformerModelConfig( + encoder: encoder, + decoder: decoder, + ); + + final modelConfig = sherpa_onnx.OnlineModelConfig( + paraformer: paraformer, + tokens: tokens, + debug: true, + numThreads: 1, + ); + final config = sherpa_onnx.OnlineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OnlineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + // simulate streaming. You can choose an arbitrary chunk size. 
+ // chunkSize of a single sample is also ok, i.e, chunkSize = 1 + final chunkSize = 1600; // 0.1 second for 16kHz + final numChunks = waveData.samples.length ~/ chunkSize; + + var last = ''; + for (int i = 0; i != numChunks; ++i) { + int start = i * chunkSize; + stream.acceptWaveform( + samples: + Float32List.sublistView(waveData.samples, start, start + chunkSize), + sampleRate: waveData.sampleRate, + ); + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + final result = recognizer.getResult(stream); + if (result.text != last && result.text != '') { + last = result.text; + print(last); + } + } + + // 0.5 seconds, assume sampleRate is 16kHz + final tailPaddings = Float32List(8000); + stream.acceptWaveform( + samples: tailPaddings, + sampleRate: waveData.sampleRate, + ); + + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + + final result = recognizer.getResult(stream); + + if (result.text != '') { + print(result.text); + } + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/streaming-asr/bin/zipformer-ctc-hlg.dart b/dart-api-examples/streaming-asr/bin/zipformer-ctc-hlg.dart new file mode 100644 index 000000000..6dd95c12b --- /dev/null +++ b/dart-api-examples/streaming-asr/bin/zipformer-ctc-hlg.dart @@ -0,0 +1,94 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the model') + ..addOption('hlg', help: 'Path to HLG.fst') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['model'] == null || + res['hlg'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + 
exit(1); + } + + final model = res['model'] as String; + final hlg = res['hlg'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final ctc = sherpa_onnx.OnlineZipformer2CtcModelConfig( + model: model, + ); + + final modelConfig = sherpa_onnx.OnlineModelConfig( + zipformer2Ctc: ctc, + tokens: tokens, + debug: true, + numThreads: 1, + ); + final config = sherpa_onnx.OnlineRecognizerConfig( + model: modelConfig, + ctcFstDecoderConfig: sherpa_onnx.OnlineCtcFstDecoderConfig(graph: hlg), + ); + final recognizer = sherpa_onnx.OnlineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + // simulate streaming. You can choose an arbitrary chunk size. + // chunkSize of a single sample is also ok, i.e, chunkSize = 1 + final chunkSize = 1600; // 0.1 second for 16kHz + final numChunks = waveData.samples.length ~/ chunkSize; + + var last = ''; + for (int i = 0; i != numChunks; ++i) { + int start = i * chunkSize; + stream.acceptWaveform( + samples: + Float32List.sublistView(waveData.samples, start, start + chunkSize), + sampleRate: waveData.sampleRate, + ); + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + final result = recognizer.getResult(stream); + if (result.text != last && result.text != '') { + last = result.text; + print(last); + } + } + + // 0.5 seconds, assume sampleRate is 16kHz + final tailPaddings = Float32List(8000); + stream.acceptWaveform( + samples: tailPaddings, + sampleRate: waveData.sampleRate, + ); + + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + + final result = recognizer.getResult(stream); + + if (result.text != '') { + print(result.text); + } + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/streaming-asr/bin/zipformer-ctc.dart b/dart-api-examples/streaming-asr/bin/zipformer-ctc.dart new file mode 100644 index 000000000..f00d66fec --- /dev/null +++ 
b/dart-api-examples/streaming-asr/bin/zipformer-ctc.dart @@ -0,0 +1,88 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['model'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final ctc = sherpa_onnx.OnlineZipformer2CtcModelConfig( + model: model, + ); + + final modelConfig = sherpa_onnx.OnlineModelConfig( + zipformer2Ctc: ctc, + tokens: tokens, + debug: true, + numThreads: 1, + ); + final config = sherpa_onnx.OnlineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OnlineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + // simulate streaming. You can choose an arbitrary chunk size. 
+ // chunkSize of a single sample is also ok, i.e, chunkSize = 1 + final chunkSize = 1600; // 0.1 second for 16kHz + final numChunks = waveData.samples.length ~/ chunkSize; + + var last = ''; + for (int i = 0; i != numChunks; ++i) { + int start = i * chunkSize; + stream.acceptWaveform( + samples: + Float32List.sublistView(waveData.samples, start, start + chunkSize), + sampleRate: waveData.sampleRate, + ); + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + final result = recognizer.getResult(stream); + if (result.text != last && result.text != '') { + last = result.text; + print(last); + } + } + + // 0.5 seconds, assume sampleRate is 16kHz + final tailPaddings = Float32List(8000); + stream.acceptWaveform( + samples: tailPaddings, + sampleRate: waveData.sampleRate, + ); + + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + + final result = recognizer.getResult(stream); + + if (result.text != '') { + print(result.text); + } + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/streaming-asr/bin/zipformer-transducer.dart b/dart-api-examples/streaming-asr/bin/zipformer-transducer.dart new file mode 100644 index 000000000..438af31e5 --- /dev/null +++ b/dart-api-examples/streaming-asr/bin/zipformer-transducer.dart @@ -0,0 +1,96 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('encoder', help: 'Path to the encoder model') + ..addOption('decoder', help: 'Path to decoder model') + ..addOption('joiner', help: 'Path to joiner model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['encoder'] == null || + res['decoder'] == 
null || + res['joiner'] == null || + res['tokens'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final encoder = res['encoder'] as String; + final decoder = res['decoder'] as String; + final joiner = res['joiner'] as String; + final tokens = res['tokens'] as String; + final inputWav = res['input-wav'] as String; + + final transducer = sherpa_onnx.OnlineTransducerModelConfig( + encoder: encoder, + decoder: decoder, + joiner: joiner, + ); + + final modelConfig = sherpa_onnx.OnlineModelConfig( + transducer: transducer, + tokens: tokens, + debug: true, + numThreads: 1, + ); + final config = sherpa_onnx.OnlineRecognizerConfig(model: modelConfig); + final recognizer = sherpa_onnx.OnlineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + // simulate streaming. You can choose an arbitrary chunk size. + // chunkSize of a single sample is also ok, i.e, chunkSize = 1 + final chunkSize = 1600; // 0.1 second for 16kHz + final numChunks = waveData.samples.length ~/ chunkSize; + + var last = ''; + for (int i = 0; i != numChunks; ++i) { + int start = i * chunkSize; + stream.acceptWaveform( + samples: + Float32List.sublistView(waveData.samples, start, start + chunkSize), + sampleRate: waveData.sampleRate, + ); + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + final result = recognizer.getResult(stream); + if (result.text != last && result.text != '') { + last = result.text; + print(last); + } + } + + // 0.5 seconds, assume sampleRate is 16kHz + final tailPaddings = Float32List(8000); + stream.acceptWaveform( + samples: tailPaddings, + sampleRate: waveData.sampleRate, + ); + + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + + final result = recognizer.getResult(stream); + + if (result.text != '') { + print(result.text); + } + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/streaming-asr/pubspec.lock 
b/dart-api-examples/streaming-asr/pubspec.lock new file mode 100644 index 000000000..349b3b461 --- /dev/null +++ b/dart-api-examples/streaming-asr/pubspec.lock @@ -0,0 +1,432 @@ +# Generated by pub +# See https://dart.dev/tools/pub/glossary#lockfile +packages: + _fe_analyzer_shared: + dependency: transitive + description: + name: _fe_analyzer_shared + sha256: "0b2f2bd91ba804e53a61d757b986f89f1f9eaed5b11e4b2f5a2468d86d6c9fc7" + url: "https://pub.dev" + source: hosted + version: "67.0.0" + analyzer: + dependency: transitive + description: + name: analyzer + sha256: "37577842a27e4338429a1cbc32679d508836510b056f1eedf0c8d20e39c1383d" + url: "https://pub.dev" + source: hosted + version: "6.4.1" + args: + dependency: "direct main" + description: + name: args + sha256: "7cf60b9f0cc88203c5a190b4cd62a99feea42759a7fa695010eb5de1c0b2252a" + url: "https://pub.dev" + source: hosted + version: "2.5.0" + async: + dependency: transitive + description: + name: async + sha256: "947bfcf187f74dbc5e146c9eb9c0f10c9f8b30743e341481c1e2ed3ecc18c20c" + url: "https://pub.dev" + source: hosted + version: "2.11.0" + boolean_selector: + dependency: transitive + description: + name: boolean_selector + sha256: "6cfb5af12253eaf2b368f07bacc5a80d1301a071c73360d746b7f2e32d762c66" + url: "https://pub.dev" + source: hosted + version: "2.1.1" + characters: + dependency: transitive + description: + name: characters + sha256: "04a925763edad70e8443c99234dc3328f442e811f1d8fd1a72f1c8ad0f69a605" + url: "https://pub.dev" + source: hosted + version: "1.3.0" + collection: + dependency: transitive + description: + name: collection + sha256: ee67cb0715911d28db6bf4af1026078bd6f0128b07a5f66fb2ed94ec6783c09a + url: "https://pub.dev" + source: hosted + version: "1.18.0" + convert: + dependency: transitive + description: + name: convert + sha256: "0f08b14755d163f6e2134cb58222dd25ea2a2ee8a195e53983d57c075324d592" + url: "https://pub.dev" + source: hosted + version: "3.1.1" + coverage: + dependency: transitive + 
description: + name: coverage + sha256: "3945034e86ea203af7a056d98e98e42a5518fff200d6e8e6647e1886b07e936e" + url: "https://pub.dev" + source: hosted + version: "1.8.0" + crypto: + dependency: transitive + description: + name: crypto + sha256: ff625774173754681d66daaf4a448684fb04b78f902da9cb3d308c19cc5e8bab + url: "https://pub.dev" + source: hosted + version: "3.0.3" + ffi: + dependency: transitive + description: + name: ffi + sha256: "493f37e7df1804778ff3a53bd691d8692ddf69702cf4c1c1096a2e41b4779e21" + url: "https://pub.dev" + source: hosted + version: "2.1.2" + file: + dependency: transitive + description: + name: file + sha256: "5fc22d7c25582e38ad9a8515372cd9a93834027aacf1801cf01164dac0ffa08c" + url: "https://pub.dev" + source: hosted + version: "7.0.0" + flutter: + dependency: transitive + description: flutter + source: sdk + version: "0.0.0" + frontend_server_client: + dependency: transitive + description: + name: frontend_server_client + sha256: f64a0333a82f30b0cca061bc3d143813a486dc086b574bfb233b7c1372427694 + url: "https://pub.dev" + source: hosted + version: "4.0.0" + glob: + dependency: transitive + description: + name: glob + sha256: "0e7014b3b7d4dac1ca4d6114f82bf1782ee86745b9b42a92c9289c23d8a0ab63" + url: "https://pub.dev" + source: hosted + version: "2.1.2" + http_multi_server: + dependency: transitive + description: + name: http_multi_server + sha256: "97486f20f9c2f7be8f514851703d0119c3596d14ea63227af6f7a481ef2b2f8b" + url: "https://pub.dev" + source: hosted + version: "3.2.1" + http_parser: + dependency: transitive + description: + name: http_parser + sha256: "2aa08ce0341cc9b354a498388e30986515406668dbcc4f7c950c3e715496693b" + url: "https://pub.dev" + source: hosted + version: "4.0.2" + io: + dependency: transitive + description: + name: io + sha256: "2ec25704aba361659e10e3e5f5d672068d332fc8ac516421d483a11e5cbd061e" + url: "https://pub.dev" + source: hosted + version: "1.0.4" + js: + dependency: transitive + description: + name: js + sha256: 
c1b2e9b5ea78c45e1a0788d29606ba27dc5f71f019f32ca5140f61ef071838cf + url: "https://pub.dev" + source: hosted + version: "0.7.1" + lints: + dependency: "direct dev" + description: + name: lints + sha256: cbf8d4b858bb0134ef3ef87841abdf8d63bfc255c266b7bf6b39daa1085c4290 + url: "https://pub.dev" + source: hosted + version: "3.0.0" + logging: + dependency: transitive + description: + name: logging + sha256: "623a88c9594aa774443aa3eb2d41807a48486b5613e67599fb4c41c0ad47c340" + url: "https://pub.dev" + source: hosted + version: "1.2.0" + matcher: + dependency: transitive + description: + name: matcher + sha256: d2323aa2060500f906aa31a895b4030b6da3ebdcc5619d14ce1aada65cd161cb + url: "https://pub.dev" + source: hosted + version: "0.12.16+1" + material_color_utilities: + dependency: transitive + description: + name: material_color_utilities + sha256: "0e0a020085b65b6083975e499759762399b4475f766c21668c4ecca34ea74e5a" + url: "https://pub.dev" + source: hosted + version: "0.8.0" + meta: + dependency: transitive + description: + name: meta + sha256: "7687075e408b093f36e6bbf6c91878cc0d4cd10f409506f7bc996f68220b9136" + url: "https://pub.dev" + source: hosted + version: "1.12.0" + mime: + dependency: transitive + description: + name: mime + sha256: "2e123074287cc9fd6c09de8336dae606d1ddb88d9ac47358826db698c176a1f2" + url: "https://pub.dev" + source: hosted + version: "1.0.5" + node_preamble: + dependency: transitive + description: + name: node_preamble + sha256: "6e7eac89047ab8a8d26cf16127b5ed26de65209847630400f9aefd7cd5c730db" + url: "https://pub.dev" + source: hosted + version: "2.0.2" + package_config: + dependency: transitive + description: + name: package_config + sha256: "1c5b77ccc91e4823a5af61ee74e6b972db1ef98c2ff5a18d3161c982a55448bd" + url: "https://pub.dev" + source: hosted + version: "2.1.0" + path: + dependency: "direct main" + description: + name: path + sha256: "087ce49c3f0dc39180befefc60fdb4acd8f8620e5682fe2476afd0b3688bb4af" + url: "https://pub.dev" + source: hosted + 
version: "1.9.0" + pool: + dependency: transitive + description: + name: pool + sha256: "20fe868b6314b322ea036ba325e6fc0711a22948856475e2c2b6306e8ab39c2a" + url: "https://pub.dev" + source: hosted + version: "1.5.1" + pub_semver: + dependency: transitive + description: + name: pub_semver + sha256: "40d3ab1bbd474c4c2328c91e3a7df8c6dd629b79ece4c4bd04bee496a224fb0c" + url: "https://pub.dev" + source: hosted + version: "2.1.4" + shelf: + dependency: transitive + description: + name: shelf + sha256: ad29c505aee705f41a4d8963641f91ac4cee3c8fad5947e033390a7bd8180fa4 + url: "https://pub.dev" + source: hosted + version: "1.4.1" + shelf_packages_handler: + dependency: transitive + description: + name: shelf_packages_handler + sha256: "89f967eca29607c933ba9571d838be31d67f53f6e4ee15147d5dc2934fee1b1e" + url: "https://pub.dev" + source: hosted + version: "3.0.2" + shelf_static: + dependency: transitive + description: + name: shelf_static + sha256: a41d3f53c4adf0f57480578c1d61d90342cd617de7fc8077b1304643c2d85c1e + url: "https://pub.dev" + source: hosted + version: "1.1.2" + shelf_web_socket: + dependency: transitive + description: + name: shelf_web_socket + sha256: "9ca081be41c60190ebcb4766b2486a7d50261db7bd0f5d9615f2d653637a84c1" + url: "https://pub.dev" + source: hosted + version: "1.0.4" + sherpa_onnx: + dependency: "direct main" + description: + name: sherpa_onnx + sha256: e45894f81e7c854ca96d678bcab5303036e884a7c90e9a6c4ec04c7b1ee215a8 + url: "https://pub.dev" + source: hosted + version: "1.9.29" + sky_engine: + dependency: transitive + description: flutter + source: sdk + version: "0.0.99" + source_map_stack_trace: + dependency: transitive + description: + name: source_map_stack_trace + sha256: "84cf769ad83aa6bb61e0aa5a18e53aea683395f196a6f39c4c881fb90ed4f7ae" + url: "https://pub.dev" + source: hosted + version: "2.1.1" + source_maps: + dependency: transitive + description: + name: source_maps + sha256: "708b3f6b97248e5781f493b765c3337db11c5d2c81c3094f10904bfa8004c703" + 
url: "https://pub.dev" + source: hosted + version: "0.10.12" + source_span: + dependency: transitive + description: + name: source_span + sha256: "53e943d4206a5e30df338fd4c6e7a077e02254531b138a15aec3bd143c1a8b3c" + url: "https://pub.dev" + source: hosted + version: "1.10.0" + stack_trace: + dependency: transitive + description: + name: stack_trace + sha256: "73713990125a6d93122541237550ee3352a2d84baad52d375a4cad2eb9b7ce0b" + url: "https://pub.dev" + source: hosted + version: "1.11.1" + stream_channel: + dependency: transitive + description: + name: stream_channel + sha256: ba2aa5d8cc609d96bbb2899c28934f9e1af5cddbd60a827822ea467161eb54e7 + url: "https://pub.dev" + source: hosted + version: "2.1.2" + string_scanner: + dependency: transitive + description: + name: string_scanner + sha256: "556692adab6cfa87322a115640c11f13cb77b3f076ddcc5d6ae3c20242bedcde" + url: "https://pub.dev" + source: hosted + version: "1.2.0" + term_glyph: + dependency: transitive + description: + name: term_glyph + sha256: a29248a84fbb7c79282b40b8c72a1209db169a2e0542bce341da992fe1bc7e84 + url: "https://pub.dev" + source: hosted + version: "1.2.1" + test: + dependency: "direct dev" + description: + name: test + sha256: "7ee446762c2c50b3bd4ea96fe13ffac69919352bd3b4b17bac3f3465edc58073" + url: "https://pub.dev" + source: hosted + version: "1.25.2" + test_api: + dependency: transitive + description: + name: test_api + sha256: "9955ae474176f7ac8ee4e989dadfb411a58c30415bcfb648fa04b2b8a03afa7f" + url: "https://pub.dev" + source: hosted + version: "0.7.0" + test_core: + dependency: transitive + description: + name: test_core + sha256: "2bc4b4ecddd75309300d8096f781c0e3280ca1ef85beda558d33fcbedc2eead4" + url: "https://pub.dev" + source: hosted + version: "0.6.0" + typed_data: + dependency: transitive + description: + name: typed_data + sha256: facc8d6582f16042dd49f2463ff1bd6e2c9ef9f3d5da3d9b087e244a7b564b3c + url: "https://pub.dev" + source: hosted + version: "1.3.2" + vector_math: + dependency: 
transitive + description: + name: vector_math + sha256: "80b3257d1492ce4d091729e3a67a60407d227c27241d6927be0130c98e741803" + url: "https://pub.dev" + source: hosted + version: "2.1.4" + vm_service: + dependency: transitive + description: + name: vm_service + sha256: f652077d0bdf60abe4c1f6377448e8655008eef28f128bc023f7b5e8dfeb48fc + url: "https://pub.dev" + source: hosted + version: "14.2.4" + watcher: + dependency: transitive + description: + name: watcher + sha256: "3d2ad6751b3c16cf07c7fca317a1413b3f26530319181b37e3b9039b84fc01d8" + url: "https://pub.dev" + source: hosted + version: "1.1.0" + web: + dependency: transitive + description: + name: web + sha256: "97da13628db363c635202ad97068d47c5b8aa555808e7a9411963c533b449b27" + url: "https://pub.dev" + source: hosted + version: "0.5.1" + web_socket_channel: + dependency: transitive + description: + name: web_socket_channel + sha256: "58c6666b342a38816b2e7e50ed0f1e261959630becd4c879c4f26bfa14aa5a42" + url: "https://pub.dev" + source: hosted + version: "2.4.5" + webkit_inspection_protocol: + dependency: transitive + description: + name: webkit_inspection_protocol + sha256: "87d3f2333bb240704cd3f1c6b5b7acd8a10e7f0bc28c28dcf14e782014f4a572" + url: "https://pub.dev" + source: hosted + version: "1.2.1" + yaml: + dependency: transitive + description: + name: yaml + sha256: "75769501ea3489fca56601ff33454fe45507ea3bfb014161abc3b43ae25989d5" + url: "https://pub.dev" + source: hosted + version: "3.1.2" +sdks: + dart: ">=3.4.0 <4.0.0" + flutter: ">=3.3.0" diff --git a/dart-api-examples/streaming-asr/pubspec.yaml b/dart-api-examples/streaming-asr/pubspec.yaml new file mode 100644 index 000000000..372fbf6e3 --- /dev/null +++ b/dart-api-examples/streaming-asr/pubspec.yaml @@ -0,0 +1,20 @@ +name: streaming_asr + +description: > + This example demonstrates how to use the Dart API for streaming speech recognition. 
+ +version: 1.0.0 +# repository: https://github.com/my_org/my_repo + +environment: + sdk: ^3.4.0 + +# Add regular dependencies here. +dependencies: + sherpa_onnx: ^1.9.29 + path: ^1.9.0 + args: ^2.5.0 + +dev_dependencies: + lints: ^3.0.0 + test: ^1.24.0 diff --git a/dart-api-examples/streaming-asr/run-nemo-transducer.sh b/dart-api-examples/streaming-asr/run-nemo-transducer.sh new file mode 100755 index 000000000..5f2f50de6 --- /dev/null +++ b/dart-api-examples/streaming-asr/run-nemo-transducer.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms.tar.bz2 + tar xvf sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms.tar.bz2 + rm sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms.tar.bz2 +fi + +dart run \ + ./bin/zipformer-transducer.dart \ + --encoder ./sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms/encoder.onnx \ + --decoder ./sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms/decoder.onnx \ + --joiner ./sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms/joiner.onnx \ + --tokens ./sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms/tokens.txt \ + --input-wav ./sherpa-onnx-nemo-streaming-fast-conformer-transducer-en-80ms/test_wavs/0.wav diff --git a/dart-api-examples/streaming-asr/run-paraformer.sh b/dart-api-examples/streaming-asr/run-paraformer.sh new file mode 100755 index 000000000..95a8588ef --- /dev/null +++ b/dart-api-examples/streaming-asr/run-paraformer.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! 
-f ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 + tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 + rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 +fi + +dart run \ + ./bin/paraformer.dart \ + --encoder ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx \ + --decoder ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx \ + --tokens ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt \ + --input-wav ./sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav diff --git a/dart-api-examples/streaming-asr/run-zipformer-ctc-hlg.sh b/dart-api-examples/streaming-asr/run-zipformer-ctc-hlg.sh new file mode 100755 index 000000000..2e55406f8 --- /dev/null +++ b/dart-api-examples/streaming-asr/run-zipformer-ctc-hlg.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! 
-f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +fi + +dart run \ + ./bin/zipformer-ctc-hlg.dart \ + --model ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx \ + --hlg ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst \ + --tokens ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt \ + --input-wav ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/1.wav diff --git a/dart-api-examples/streaming-asr/run-zipformer-ctc.sh b/dart-api-examples/streaming-asr/run-zipformer-ctc.sh new file mode 100755 index 000000000..26fa00546 --- /dev/null +++ b/dart-api-examples/streaming-asr/run-zipformer-ctc.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! 
-f ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 + rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 +fi + +dart run \ + ./bin/zipformer-ctc.dart \ + --model ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx \ + --tokens ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/tokens.txt \ + --input-wav ./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/1.wav diff --git a/dart-api-examples/streaming-asr/run-zipformer-transducer.sh b/dart-api-examples/streaming-asr/run-zipformer-transducer.sh new file mode 100755 index 000000000..767d90a8d --- /dev/null +++ b/dart-api-examples/streaming-asr/run-zipformer-transducer.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! 
-f ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +fi + +dart run \ + ./bin/zipformer-transducer.dart \ + --encoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx \ + --decoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ + --joiner ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx \ + --tokens ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + --input-wav ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav diff --git a/dart-api-examples/vad/bin/init.dart b/dart-api-examples/vad/bin/init.dart index 072ff29b8..09f24793c 100644 --- a/dart-api-examples/vad/bin/init.dart +++ b/dart-api-examples/vad/bin/init.dart @@ -1,3 +1,4 @@ +// Copyright (c) 2024 Xiaomi Corporation import 'dart:io'; import 'dart:isolate'; import 'package:path/path.dart' as p; diff --git a/dart-api-examples/vad/bin/vad.dart b/dart-api-examples/vad/bin/vad.dart index 3ef9f66da..01618cc19 100644 --- a/dart-api-examples/vad/bin/vad.dart +++ b/dart-api-examples/vad/bin/vad.dart @@ -1,3 +1,4 @@ +// Copyright (c) 2024 Xiaomi Corporation import 'dart:io'; import 'dart:typed_data'; diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json index 5fb5743af..55298ec94 100644 --- a/nodejs-addon-examples/package.json +++ b/nodejs-addon-examples/package.json @@ -1,5 +1,5 @@ { "dependencies": { - "sherpa-onnx-node": "^1.0.27" + "sherpa-onnx-node": "^1.9.29" } } diff --git a/scripts/dart/streaming-asr-pubspec.yaml b/scripts/dart/streaming-asr-pubspec.yaml new file mode 100644 index 
000000000..c9ee9e3ce --- /dev/null +++ b/scripts/dart/streaming-asr-pubspec.yaml @@ -0,0 +1,21 @@ +name: streaming_asr + +description: > + This example demonstrates how to use the Dart API for streaming speech recognition. + +version: 1.0.0 +# repository: https://github.com/my_org/my_repo + +environment: + sdk: ^3.4.0 + +# Add regular dependencies here. +dependencies: + sherpa_onnx: + path: ../../sherpa-onnx/flutter + path: ^1.9.0 + args: ^2.5.0 + +dev_dependencies: + lints: ^3.0.0 + test: ^1.24.0 From e52d32b95b0258b5e603fa61816080507413a500 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 15 Jun 2024 14:30:36 +0800 Subject: [PATCH 021/237] Add TTS API and examples for Dart (#1010) --- .github/scripts/test-dart.sh | 16 ++ .github/workflows/test-dart.yaml | 1 + dart-api-examples/streaming-asr/README.md | 1 + dart-api-examples/tts/.gitignore | 3 + dart-api-examples/tts/CHANGELOG.md | 3 + dart-api-examples/tts/README.md | 10 + dart-api-examples/tts/analysis_options.yaml | 30 +++ dart-api-examples/tts/bin/coqui.dart | 69 +++++++ dart-api-examples/tts/bin/init.dart | 1 + dart-api-examples/tts/bin/piper.dart | 80 +++++++ dart-api-examples/tts/bin/zh.dart | 86 ++++++++ dart-api-examples/tts/pubspec.lock | 96 +++++++++ dart-api-examples/tts/pubspec.yaml | 16 ++ dart-api-examples/tts/run-coqui.sh | 28 +++ dart-api-examples/tts/run-piper.sh | 28 +++ dart-api-examples/tts/run-zh.sh | 41 ++++ scripts/dart/tts-pubspec.yaml | 17 ++ sherpa-onnx/flutter/lib/sherpa_onnx.dart | 1 + .../flutter/lib/src/sherpa_onnx_bindings.dart | 152 ++++++++++++++ sherpa-onnx/flutter/lib/src/tts.dart | 195 ++++++++++++++++++ 20 files changed, 874 insertions(+) create mode 100644 dart-api-examples/tts/.gitignore create mode 100644 dart-api-examples/tts/CHANGELOG.md create mode 100644 dart-api-examples/tts/README.md create mode 100644 dart-api-examples/tts/analysis_options.yaml create mode 100644 dart-api-examples/tts/bin/coqui.dart create mode 120000 dart-api-examples/tts/bin/init.dart create 
mode 100644 dart-api-examples/tts/bin/piper.dart create mode 100644 dart-api-examples/tts/bin/zh.dart create mode 100644 dart-api-examples/tts/pubspec.lock create mode 100644 dart-api-examples/tts/pubspec.yaml create mode 100755 dart-api-examples/tts/run-coqui.sh create mode 100755 dart-api-examples/tts/run-piper.sh create mode 100755 dart-api-examples/tts/run-zh.sh create mode 100644 scripts/dart/tts-pubspec.yaml create mode 100644 sherpa-onnx/flutter/lib/src/tts.dart diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index 0599864c0..763f2bcc4 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -4,6 +4,22 @@ set -ex cd dart-api-examples +pushd tts + +echo '----------piper tts----------' +./run-piper.sh +rm -rf vits-piper-* + +echo '----------coqui tts----------' +./run-coqui.sh +rm -rf vits-coqui-* + +echo '----------zh tts----------' +./run-zh.sh +rm -rf sherpa-onnx-* + +popd # tts + pushd streaming-asr echo '----------streaming zipformer ctc HLG----------' diff --git a/.github/workflows/test-dart.yaml b/.github/workflows/test-dart.yaml index c1605a382..580071b81 100644 --- a/.github/workflows/test-dart.yaml +++ b/.github/workflows/test-dart.yaml @@ -92,5 +92,6 @@ jobs: cp scripts/dart/vad-pubspec.yaml dart-api-examples/vad/pubspec.yaml cp scripts/dart/non-streaming-asr-pubspec.yaml dart-api-examples/non-streaming-asr/pubspec.yaml cp scripts/dart/streaming-asr-pubspec.yaml dart-api-examples/streaming-asr/pubspec.yaml + cp scripts/dart/tts-pubspec.yaml dart-api-examples/tts/pubspec.yaml .github/scripts/test-dart.sh diff --git a/dart-api-examples/streaming-asr/README.md b/dart-api-examples/streaming-asr/README.md index 357b5f154..6c090b043 100644 --- a/dart-api-examples/streaming-asr/README.md +++ b/dart-api-examples/streaming-asr/README.md @@ -1,6 +1,7 @@ # Introduction This folder contains examples for streaming ASR with Dart API. 
+ | File | Description| |------|------------| |[./bin/nemo-transducer.dart](./bin/nemo-transducer.dart)| Use a NeMo transducer model for speech recognition. See [./run-nemo-transducer.sh](./run-nemo-transducer.sh)| diff --git a/dart-api-examples/tts/.gitignore b/dart-api-examples/tts/.gitignore new file mode 100644 index 000000000..3a8579040 --- /dev/null +++ b/dart-api-examples/tts/.gitignore @@ -0,0 +1,3 @@ +# https://dart.dev/guides/libraries/private-files +# Created by `dart pub` +.dart_tool/ diff --git a/dart-api-examples/tts/CHANGELOG.md b/dart-api-examples/tts/CHANGELOG.md new file mode 100644 index 000000000..effe43c82 --- /dev/null +++ b/dart-api-examples/tts/CHANGELOG.md @@ -0,0 +1,3 @@ +## 1.0.0 + +- Initial version. diff --git a/dart-api-examples/tts/README.md b/dart-api-examples/tts/README.md new file mode 100644 index 000000000..6e7fd0fbf --- /dev/null +++ b/dart-api-examples/tts/README.md @@ -0,0 +1,10 @@ +# Introduction + +This folder contains examples for text to speech with Dart API. + +| File | Description| +|------|------------| +|[./bin/piper.dart](./bin/piper.dart)| Use a Piper tts model for text to speech. See [./run-piper.sh](./run-piper.sh)| +|[./bin/coqui.dart](./bin/coqui.dart)| Use a Coqui tts model for text to speech. See [./run-coqui.sh](./run-coqui.sh)| +|[./bin/zh.dart](./bin/zh.dart)| Use a Chinese VITS tts model for text to speech. See [./run-zh.sh](./run-zh.sh)| + diff --git a/dart-api-examples/tts/analysis_options.yaml b/dart-api-examples/tts/analysis_options.yaml new file mode 100644 index 000000000..dee8927aa --- /dev/null +++ b/dart-api-examples/tts/analysis_options.yaml @@ -0,0 +1,30 @@ +# This file configures the static analysis results for your project (errors, +# warnings, and lints). +# +# This enables the 'recommended' set of lints from `package:lints`. 
+# This set helps identify many issues that may lead to problems when running +# or consuming Dart code, and enforces writing Dart using a single, idiomatic +# style and format. +# +# If you want a smaller set of lints you can change this to specify +# 'package:lints/core.yaml'. These are just the most critical lints +# (the recommended set includes the core lints). +# The core lints are also what is used by pub.dev for scoring packages. + +include: package:lints/recommended.yaml + +# Uncomment the following section to specify additional rules. + +# linter: +# rules: +# - camel_case_types + +# analyzer: +# exclude: +# - path/to/excluded/files/** + +# For more information about the core and recommended set of lints, see +# https://dart.dev/go/core-lints + +# For additional information about configuring this file, see +# https://dart.dev/guides/language/analysis-options diff --git a/dart-api-examples/tts/bin/coqui.dart b/dart-api-examples/tts/bin/coqui.dart new file mode 100644 index 000000000..264d671b6 --- /dev/null +++ b/dart-api-examples/tts/bin/coqui.dart @@ -0,0 +1,69 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the ONNX model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('text', help: 'Text to generate TTS for') + ..addOption('output-wav', help: 'Filename to save the generated audio') + ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0') + ..addOption( + 'sid', + help: 'Speaker ID to select. 
Used only for multi-speaker TTS', + defaultsTo: '0', + ); + final res = parser.parse(arguments); + if (res['model'] == null || + res['tokens'] == null || + res['output-wav'] == null || + res['text'] == null) { + print(parser.usage); + exit(1); + } + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final text = res['text'] as String; + final outputWav = res['output-wav'] as String; + var speed = double.tryParse(res['speed'] as String) ?? 1.0; + final sid = int.tryParse(res['sid'] as String) ?? 0; + + if (speed == 0) { + speed = 1.0; + } + + final vits = sherpa_onnx.OfflineTtsVitsModelConfig( + model: model, + tokens: tokens, + lengthScale: 1 / speed, + ); + + final modelConfig = sherpa_onnx.OfflineTtsModelConfig( + vits: vits, + numThreads: 1, + debug: true, + ); + final config = sherpa_onnx.OfflineTtsConfig( + model: modelConfig, + maxNumSenetences: 1, + ); + + final tts = sherpa_onnx.OfflineTts(config); + final audio = tts.generate(text: text, sid: sid, speed: speed); + tts.free(); + + sherpa_onnx.writeWave( + filename: outputWav, + samples: audio.samples, + sampleRate: audio.sampleRate, + ); + print('Saved to ${outputWav}'); +} diff --git a/dart-api-examples/tts/bin/init.dart b/dart-api-examples/tts/bin/init.dart new file mode 120000 index 000000000..48508cfd3 --- /dev/null +++ b/dart-api-examples/tts/bin/init.dart @@ -0,0 +1 @@ +../../vad/bin/init.dart \ No newline at end of file diff --git a/dart-api-examples/tts/bin/piper.dart b/dart-api-examples/tts/bin/piper.dart new file mode 100644 index 000000000..7adcf0962 --- /dev/null +++ b/dart-api-examples/tts/bin/piper.dart @@ -0,0 +1,80 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the 
ONNX model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('data-dir', help: 'Path to espeak-ng-data directory') + ..addOption('text', help: 'Text to generate TTS for') + ..addOption('output-wav', help: 'Filename to save the generated audio') + ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0') + ..addOption( + 'sid', + help: 'Speaker ID to select. Used only for multi-speaker TTS', + defaultsTo: '0', + ); + final res = parser.parse(arguments); + if (res['model'] == null || + res['tokens'] == null || + res['data-dir'] == null || + res['output-wav'] == null || + res['text'] == null) { + print(parser.usage); + exit(1); + } + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final dataDir = res['data-dir'] as String; + final text = res['text'] as String; + final outputWav = res['output-wav'] as String; + var speed = double.tryParse(res['speed'] as String) ?? 1.0; + final sid = int.tryParse(res['sid'] as String) ?? 0; + + if (speed == 0) { + speed = 1.0; + } + + final vits = sherpa_onnx.OfflineTtsVitsModelConfig( + model: model, + tokens: tokens, + dataDir: dataDir, + lengthScale: 1 / speed, + ); + + final modelConfig = sherpa_onnx.OfflineTtsModelConfig( + vits: vits, + numThreads: 1, + debug: true, + ); + final config = sherpa_onnx.OfflineTtsConfig( + model: modelConfig, + maxNumSenetences: 1, + ); + + final tts = sherpa_onnx.OfflineTts(config); + final audio = tts.generateWithCallback( + text: text, + sid: sid, + speed: speed, + callback: (Float32List samples) { + print('${samples.length} samples received'); + // You can play samples in a separate thread/isolate + }); + tts.free(); + + sherpa_onnx.writeWave( + filename: outputWav, + samples: audio.samples, + sampleRate: audio.sampleRate, + ); + print('Saved to ${outputWav}'); +} diff --git a/dart-api-examples/tts/bin/zh.dart b/dart-api-examples/tts/bin/zh.dart new file mode 100644 index 000000000..44770ab3f --- /dev/null +++ 
b/dart-api-examples/tts/bin/zh.dart @@ -0,0 +1,86 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the ONNX model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('lexicon', help: 'Path to lexicon.txt') + ..addOption( + 'dict-dir', + help: 'Path to jieba dict directory', + defaultsTo: '', + ) + ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '') + ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '') + ..addOption('text', help: 'Text to generate TTS for') + ..addOption('output-wav', help: 'Filename to save the generated audio') + ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0') + ..addOption( + 'sid', + help: 'Speaker ID to select. Used only for multi-speaker TTS', + defaultsTo: '0', + ); + final res = parser.parse(arguments); + if (res['model'] == null || + res['lexicon'] == null || + res['tokens'] == null || + res['output-wav'] == null || + res['text'] == null) { + print(parser.usage); + exit(1); + } + final model = res['model'] as String; + final lexicon = res['lexicon'] as String; + final tokens = res['tokens'] as String; + final dictDir = res['dict-dir'] as String; + final ruleFsts = res['rule-fsts'] as String; + final ruleFars = res['rule-fars'] as String; + final text = res['text'] as String; + final outputWav = res['output-wav'] as String; + var speed = double.tryParse(res['speed'] as String) ?? 1.0; + final sid = int.tryParse(res['sid'] as String) ?? 
0; + + if (speed == 0) { + speed = 1.0; + } + + final vits = sherpa_onnx.OfflineTtsVitsModelConfig( + model: model, + lexicon: lexicon, + tokens: tokens, + dictDir: dictDir, + lengthScale: 1 / speed, + ); + + final modelConfig = sherpa_onnx.OfflineTtsModelConfig( + vits: vits, + numThreads: 1, + debug: true, + ); + final config = sherpa_onnx.OfflineTtsConfig( + model: modelConfig, + maxNumSenetences: 1, + ruleFsts: ruleFsts, + ruleFars: ruleFars, + ); + + final tts = sherpa_onnx.OfflineTts(config); + final audio = tts.generate(text: text, sid: sid, speed: speed); + tts.free(); + + sherpa_onnx.writeWave( + filename: outputWav, + samples: audio.samples, + sampleRate: audio.sampleRate, + ); + print('Saved to ${outputWav}'); +} diff --git a/dart-api-examples/tts/pubspec.lock b/dart-api-examples/tts/pubspec.lock new file mode 100644 index 000000000..7c77c2a62 --- /dev/null +++ b/dart-api-examples/tts/pubspec.lock @@ -0,0 +1,96 @@ +# Generated by pub +# See https://dart.dev/tools/pub/glossary#lockfile +packages: + args: + dependency: "direct main" + description: + name: args + sha256: "7cf60b9f0cc88203c5a190b4cd62a99feea42759a7fa695010eb5de1c0b2252a" + url: "https://pub.dev" + source: hosted + version: "2.5.0" + characters: + dependency: transitive + description: + name: characters + sha256: "04a925763edad70e8443c99234dc3328f442e811f1d8fd1a72f1c8ad0f69a605" + url: "https://pub.dev" + source: hosted + version: "1.3.0" + collection: + dependency: transitive + description: + name: collection + sha256: ee67cb0715911d28db6bf4af1026078bd6f0128b07a5f66fb2ed94ec6783c09a + url: "https://pub.dev" + source: hosted + version: "1.18.0" + ffi: + dependency: transitive + description: + name: ffi + sha256: "493f37e7df1804778ff3a53bd691d8692ddf69702cf4c1c1096a2e41b4779e21" + url: "https://pub.dev" + source: hosted + version: "2.1.2" + flutter: + dependency: transitive + description: flutter + source: sdk + version: "0.0.0" + lints: + dependency: "direct dev" + description: + name: lints 
+ sha256: cbf8d4b858bb0134ef3ef87841abdf8d63bfc255c266b7bf6b39daa1085c4290 + url: "https://pub.dev" + source: hosted + version: "3.0.0" + material_color_utilities: + dependency: transitive + description: + name: material_color_utilities + sha256: "0e0a020085b65b6083975e499759762399b4475f766c21668c4ecca34ea74e5a" + url: "https://pub.dev" + source: hosted + version: "0.8.0" + meta: + dependency: transitive + description: + name: meta + sha256: "7687075e408b093f36e6bbf6c91878cc0d4cd10f409506f7bc996f68220b9136" + url: "https://pub.dev" + source: hosted + version: "1.12.0" + path: + dependency: "direct main" + description: + name: path + sha256: "087ce49c3f0dc39180befefc60fdb4acd8f8620e5682fe2476afd0b3688bb4af" + url: "https://pub.dev" + source: hosted + version: "1.9.0" + sherpa_onnx: + dependency: "direct main" + description: + name: sherpa_onnx + sha256: e45894f81e7c854ca96d678bcab5303036e884a7c90e9a6c4ec04c7b1ee215a8 + url: "https://pub.dev" + source: hosted + version: "1.9.29" + sky_engine: + dependency: transitive + description: flutter + source: sdk + version: "0.0.99" + vector_math: + dependency: transitive + description: + name: vector_math + sha256: "80b3257d1492ce4d091729e3a67a60407d227c27241d6927be0130c98e741803" + url: "https://pub.dev" + source: hosted + version: "2.1.4" +sdks: + dart: ">=3.4.0 <4.0.0" + flutter: ">=3.3.0" diff --git a/dart-api-examples/tts/pubspec.yaml b/dart-api-examples/tts/pubspec.yaml new file mode 100644 index 000000000..205ef9b00 --- /dev/null +++ b/dart-api-examples/tts/pubspec.yaml @@ -0,0 +1,16 @@ +name: tts +description: A sample command-line application. +version: 1.0.0 +# repository: https://github.com/my_org/my_repo + +environment: + sdk: ^3.4.0 + +# Add regular dependencies here. 
+dependencies: + sherpa_onnx: ^1.9.29 + path: ^1.9.0 + args: ^2.5.0 + +dev_dependencies: + lints: ^3.0.0 diff --git a/dart-api-examples/tts/run-coqui.sh b/dart-api-examples/tts/run-coqui.sh new file mode 100755 index 000000000..5bde6c272 --- /dev/null +++ b/dart-api-examples/tts/run-coqui.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + + +# Please visit +# https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +# to download more models + +if [[ ! -f ./vits-coqui-de-css10/tokens.txt ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-coqui-de-css10.tar.bz2 + tar xvf vits-coqui-de-css10.tar.bz2 + rm vits-coqui-de-css10.tar.bz2 +fi + +# It is a character-based TTS model, so there is no need to use a lexicon +dart run \ + ./bin/coqui.dart \ + --model ./vits-coqui-de-css10/model.onnx \ + --tokens ./vits-coqui-de-css10/tokens.txt \ + --sid 0 \ + --speed 0.7 \ + --text 'Alles hat ein Ende, nur die Wurst hat zwei.' \ + --output-wav coqui-0.wav + +ls -lh *.wav diff --git a/dart-api-examples/tts/run-piper.sh b/dart-api-examples/tts/run-piper.sh new file mode 100755 index 000000000..ce8d42c70 --- /dev/null +++ b/dart-api-examples/tts/run-piper.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + + +# Please visit +# https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +# to download more models + +if [[ ! 
-f ./vits-piper-en_US-libritts_r-medium/tokens.txt ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2 + tar xf vits-piper-en_US-libritts_r-medium.tar.bz2 + rm vits-piper-en_US-libritts_r-medium.tar.bz2 +fi + +dart run \ + ./bin/piper.dart \ + --model ./vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx \ + --tokens ./vits-piper-en_US-libritts_r-medium/tokens.txt \ + --data-dir ./vits-piper-en_US-libritts_r-medium/espeak-ng-data \ + --sid 351 \ + --speed 1.0 \ + --text 'How are you doing? This is a speech to text example, using next generation kaldi with piper.' \ + --output-wav piper-351.wav + +ls -lh *.wav diff --git a/dart-api-examples/tts/run-zh.sh b/dart-api-examples/tts/run-zh.sh new file mode 100755 index 000000000..057260b61 --- /dev/null +++ b/dart-api-examples/tts/run-zh.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + + +# Please visit +# https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +# to download more models + +if [[ ! 
-f ./sherpa-onnx-vits-zh-ll/tokens.txt ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2 + tar xvf sherpa-onnx-vits-zh-ll.tar.bz2 + rm sherpa-onnx-vits-zh-ll.tar.bz2 +fi + +dart run \ + ./bin/zh.dart \ + --model ./sherpa-onnx-vits-zh-ll/model.onnx \ + --lexicon ./sherpa-onnx-vits-zh-ll/lexicon.txt \ + --tokens ./sherpa-onnx-vits-zh-ll/tokens.txt \ + --dict-dir ./sherpa-onnx-vits-zh-ll/dict \ + --sid 2 \ + --speed 1.0 \ + --text '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。' \ + --output-wav zh-jieba-2.wav + +dart run \ + ./bin/zh.dart \ + --model ./sherpa-onnx-vits-zh-ll/model.onnx \ + --lexicon ./sherpa-onnx-vits-zh-ll/lexicon.txt \ + --tokens ./sherpa-onnx-vits-zh-ll/tokens.txt \ + --dict-dir ./sherpa-onnx-vits-zh-ll/dict \ + --rule-fsts "./sherpa-onnx-vits-zh-ll/phone.fst,./sherpa-onnx-vits-zh-ll/date.fst,./sherpa-onnx-vits-zh-ll/number.fst" \ + --sid 3 \ + --speed 1.0 \ + --text '今天是2024年6月15号,13点23分。如果有困难,请拨打110或者18920240511。123456块钱。' \ + --output-wav zh-jieba-3.wav + +ls -lh *.wav diff --git a/scripts/dart/tts-pubspec.yaml b/scripts/dart/tts-pubspec.yaml new file mode 100644 index 000000000..0bbb3fcb7 --- /dev/null +++ b/scripts/dart/tts-pubspec.yaml @@ -0,0 +1,17 @@ +name: tts +description: A sample command-line application. +version: 1.0.0 +# repository: https://github.com/my_org/my_repo + +environment: + sdk: ^3.4.0 + +# Add regular dependencies here. 
+dependencies: + sherpa_onnx: + path: ../../sherpa-onnx/flutter + path: ^1.9.0 + args: ^2.5.0 + +dev_dependencies: + lints: ^3.0.0 diff --git a/sherpa-onnx/flutter/lib/sherpa_onnx.dart b/sherpa-onnx/flutter/lib/sherpa_onnx.dart index ff968a3d8..8cf2b16cc 100644 --- a/sherpa-onnx/flutter/lib/sherpa_onnx.dart +++ b/sherpa-onnx/flutter/lib/sherpa_onnx.dart @@ -8,6 +8,7 @@ export 'src/offline_stream.dart'; export 'src/online_recognizer.dart'; export 'src/online_stream.dart'; export 'src/speaker_identification.dart'; +export 'src/tts.dart'; export 'src/vad.dart'; export 'src/wave_reader.dart'; export 'src/wave_writer.dart'; diff --git a/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart b/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart index efc96a8f7..bce1be589 100644 --- a/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart +++ b/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart @@ -2,6 +2,55 @@ import 'dart:ffi'; import 'package:ffi/ffi.dart'; +final class SherpaOnnxOfflineTtsVitsModelConfig extends Struct { + external Pointer model; + external Pointer lexicon; + external Pointer tokens; + external Pointer dataDir; + + @Float() + external double noiseScale; + + @Float() + external double noiseScaleW; + + @Float() + external double lengthScale; + + external Pointer dictDir; +} + +final class SherpaOnnxOfflineTtsModelConfig extends Struct { + external SherpaOnnxOfflineTtsVitsModelConfig vits; + @Int32() + external int numThreads; + + @Int32() + external int debug; + + external Pointer provider; +} + +final class SherpaOnnxOfflineTtsConfig extends Struct { + external SherpaOnnxOfflineTtsModelConfig model; + external Pointer ruleFsts; + + @Int32() + external int maxNumSenetences; + + external Pointer ruleFars; +} + +final class SherpaOnnxGeneratedAudio extends Struct { + external Pointer samples; + + @Int32() + external int n; + + @Int32() + external int sampleRate; +} + final class SherpaOnnxFeatureConfig extends Struct { @Int32() external int sampleRate; 
@@ -218,6 +267,8 @@ final class SherpaOnnxSpeakerEmbeddingExtractorConfig extends Struct { external Pointer provider; } +final class SherpaOnnxOfflineTts extends Opaque {} + final class SherpaOnnxCircularBuffer extends Opaque {} final class SherpaOnnxVoiceActivityDetector extends Opaque {} @@ -234,6 +285,60 @@ final class SherpaOnnxSpeakerEmbeddingExtractor extends Opaque {} final class SherpaOnnxSpeakerEmbeddingManager extends Opaque {} +typedef SherpaOnnxCreateOfflineTtsNative = Pointer + Function(Pointer); + +typedef SherpaOnnxCreateOfflineTts = SherpaOnnxCreateOfflineTtsNative; + +typedef SherpaOnnxDestroyOfflineTtsNative = Void Function( + Pointer); + +typedef SherpaOnnxDestroyOfflineTts = void Function( + Pointer); + +typedef SherpaOnnxOfflineTtsSampleRateNative = Int32 Function( + Pointer); + +typedef SherpaOnnxOfflineTtsSampleRate = int Function( + Pointer); + +typedef SherpaOnnxOfflineTtsNumSpeakersNative = Int32 Function( + Pointer); + +typedef SherpaOnnxOfflineTtsNumSpeakers = int Function( + Pointer); + +typedef SherpaOnnxOfflineTtsGenerateNative = Pointer + Function(Pointer, Pointer, Int32, Float); + +typedef SherpaOnnxOfflineTtsGenerate = Pointer + Function(Pointer, Pointer, int, double); + +typedef SherpaOnnxDestroyOfflineTtsGeneratedAudioNative = Void Function( + Pointer); + +typedef SherpaOnnxDestroyOfflineTtsGeneratedAudio = void Function( + Pointer); + +typedef SherpaOnnxGeneratedAudioCallbackNative = Void Function( + Pointer, Int32); + +typedef SherpaOnnxOfflineTtsGenerateWithCallbackNative + = Pointer Function( + Pointer, + Pointer, + Int32, + Float, + Pointer>); + +typedef SherpaOnnxOfflineTtsGenerateWithCallback + = Pointer Function( + Pointer, + Pointer, + int, + double, + Pointer>); + typedef CreateOfflineRecognizerNative = Pointer Function(Pointer); @@ -608,6 +713,16 @@ typedef SherpaOnnxFreeWaveNative = Void Function(Pointer); typedef SherpaOnnxFreeWave = void Function(Pointer); class SherpaOnnxBindings { + static 
SherpaOnnxCreateOfflineTts? createOfflineTts; + static SherpaOnnxDestroyOfflineTts? destroyOfflineTts; + static SherpaOnnxOfflineTtsSampleRate? offlineTtsSampleRate; + static SherpaOnnxOfflineTtsNumSpeakers? offlineTtsNumSpeakers; + static SherpaOnnxOfflineTtsGenerate? offlineTtsGenerate; + static SherpaOnnxDestroyOfflineTtsGeneratedAudio? + destroyOfflineTtsGeneratedAudio; + static SherpaOnnxOfflineTtsGenerateWithCallback? + offlineTtsGenerateWithCallback; + static CreateOfflineRecognizer? createOfflineRecognizer; static DestroyOfflineRecognizer? destroyOfflineRecognizer; static CreateOfflineStream? createOfflineStream; @@ -740,6 +855,43 @@ class SherpaOnnxBindings { static SherpaOnnxFreeWave? freeWave; static void init(DynamicLibrary dynamicLibrary) { + createOfflineTts ??= dynamicLibrary + .lookup>( + 'SherpaOnnxCreateOfflineTts') + .asFunction(); + + destroyOfflineTts ??= dynamicLibrary + .lookup>( + 'SherpaOnnxDestroyOfflineTts') + .asFunction(); + + offlineTtsSampleRate ??= dynamicLibrary + .lookup>( + 'SherpaOnnxOfflineTtsSampleRate') + .asFunction(); + + offlineTtsNumSpeakers ??= dynamicLibrary + .lookup>( + 'SherpaOnnxOfflineTtsNumSpeakers') + .asFunction(); + + offlineTtsGenerate ??= dynamicLibrary + .lookup>( + 'SherpaOnnxOfflineTtsGenerate') + .asFunction(); + + destroyOfflineTtsGeneratedAudio ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxDestroyOfflineTtsGeneratedAudioNative>>( + 'SherpaOnnxDestroyOfflineTtsGeneratedAudio') + .asFunction(); + + offlineTtsGenerateWithCallback ??= dynamicLibrary + .lookup>( + 'SherpaOnnxOfflineTtsGenerateWithCallback') + .asFunction(); + createOfflineRecognizer ??= dynamicLibrary .lookup>( 'CreateOfflineRecognizer') diff --git a/sherpa-onnx/flutter/lib/src/tts.dart b/sherpa-onnx/flutter/lib/src/tts.dart new file mode 100644 index 000000000..76e00a44a --- /dev/null +++ b/sherpa-onnx/flutter/lib/src/tts.dart @@ -0,0 +1,195 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:convert'; +import 
'dart:ffi'; +import 'dart:typed_data'; + +import 'package:ffi/ffi.dart'; + +import './sherpa_onnx_bindings.dart'; + +class OfflineTtsVitsModelConfig { + const OfflineTtsVitsModelConfig({ + required this.model, + this.lexicon = '', + required this.tokens, + this.dataDir = '', + this.noiseScale = 0.667, + this.noiseScaleW = 0.8, + this.lengthScale = 1.0, + this.dictDir = '', + }); + + @override + String toString() { + return 'OfflineTtsVitsModelConfig(model: $model, lexicon: $lexicon, tokens: $tokens, dataDir: $dataDir, noiseScale: $noiseScale, noiseScaleW: $noiseScaleW, lengthScale: $lengthScale, dictDir: $dictDir)'; + } + + final String model; + final String lexicon; + final String tokens; + final String dataDir; + final double noiseScale; + final double noiseScaleW; + final double lengthScale; + final String dictDir; +} + +class OfflineTtsModelConfig { + const OfflineTtsModelConfig({ + required this.vits, + this.numThreads = 1, + this.debug = true, + this.provider = 'cpu', + }); + + @override + String toString() { + return 'OfflineTtsModelConfig(vits: $vits, numThreads: $numThreads, debug: $debug, provider: $provider)'; + } + + final OfflineTtsVitsModelConfig vits; + final int numThreads; + final bool debug; + final String provider; +} + +class OfflineTtsConfig { + const OfflineTtsConfig({ + required this.model, + this.ruleFsts = '', + this.maxNumSenetences = 1, + this.ruleFars = '', + }); + + @override + String toString() { + return 'OfflineTtsConfig(model: $model, ruleFsts: $ruleFsts, maxNumSenetences: $maxNumSenetences, ruleFars: $ruleFars)'; + } + + final OfflineTtsModelConfig model; + final String ruleFsts; + final int maxNumSenetences; + final String ruleFars; +} + +class GeneratedAudio { + GeneratedAudio({ + required this.samples, + required this.sampleRate, + }); + + final Float32List samples; + final int sampleRate; +} + +class OfflineTts { + OfflineTts._({required this.ptr, required this.config}); + + /// The user is responsible to call the 
OfflineTts.free() + /// method of the returned instance to avoid memory leak. + factory OfflineTts(OfflineTtsConfig config) { + final c = calloc(); + c.ref.model.vits.model = config.model.vits.model.toNativeUtf8(); + c.ref.model.vits.lexicon = config.model.vits.lexicon.toNativeUtf8(); + c.ref.model.vits.tokens = config.model.vits.tokens.toNativeUtf8(); + c.ref.model.vits.dataDir = config.model.vits.dataDir.toNativeUtf8(); + c.ref.model.vits.noiseScale = config.model.vits.noiseScale; + c.ref.model.vits.noiseScaleW = config.model.vits.noiseScaleW; + c.ref.model.vits.lengthScale = config.model.vits.lengthScale; + c.ref.model.vits.dictDir = config.model.vits.dictDir.toNativeUtf8(); + + c.ref.model.numThreads = config.model.numThreads; + c.ref.model.debug = config.model.debug ? 1 : 0; + c.ref.model.provider = config.model.provider.toNativeUtf8(); + + c.ref.ruleFsts = config.ruleFsts.toNativeUtf8(); + c.ref.maxNumSenetences = config.maxNumSenetences; + c.ref.ruleFars = config.ruleFars.toNativeUtf8(); + + final ptr = SherpaOnnxBindings.createOfflineTts?.call(c) ?? nullptr; + + calloc.free(c.ref.ruleFars); + calloc.free(c.ref.ruleFsts); + calloc.free(c.ref.model.provider); + calloc.free(c.ref.model.vits.dictDir); + calloc.free(c.ref.model.vits.dataDir); + calloc.free(c.ref.model.vits.tokens); + calloc.free(c.ref.model.vits.lexicon); + calloc.free(c.ref.model.vits.model); + + return OfflineTts._(ptr: ptr, config: config); + } + + void free() { + SherpaOnnxBindings.destroyOfflineTts?.call(ptr); + ptr = nullptr; + } + + GeneratedAudio generate( + {required String text, int sid = 0, double speed = 1.0}) { + final Pointer textPtr = text.toNativeUtf8(); + final p = + SherpaOnnxBindings.offlineTtsGenerate?.call(ptr, textPtr, sid, speed) ?? 
+ nullptr; + calloc.free(textPtr); + + if (p == nullptr) { + return GeneratedAudio(samples: Float32List(0), sampleRate: 0); + } + + final samples = p.ref.samples.asTypedList(p.ref.n); + final sampleRate = p.ref.sampleRate; + final newSamples = Float32List.fromList(samples); + + SherpaOnnxBindings.destroyOfflineTtsGeneratedAudio?.call(p); + + return GeneratedAudio(samples: newSamples, sampleRate: sampleRate); + } + + GeneratedAudio generateWithCallback( + {required String text, + int sid = 0, + double speed = 1.0, + required void Function(Float32List samples) callback}) { + // see + // https://github.com/dart-lang/sdk/issues/54276#issuecomment-1846109285 + // https://stackoverflow.com/questions/69537440/callbacks-in-dart-dartffi-only-supports-calling-static-dart-functions-from-nat + // https://github.com/dart-lang/sdk/blob/main/tests/ffi/isolate_local_function_callbacks_test.dart#L46 + final wrapper = + NativeCallable.isolateLocal( + (Pointer samples, int n) { + final s = samples.asTypedList(n); + final newSamples = Float32List.fromList(s); + callback(newSamples); + }); + + final Pointer textPtr = text.toNativeUtf8(); + final p = SherpaOnnxBindings.offlineTtsGenerateWithCallback + ?.call(ptr, textPtr, sid, speed, wrapper.nativeFunction) ?? + nullptr; + + calloc.free(textPtr); + wrapper.close(); + + if (p == nullptr) { + return GeneratedAudio(samples: Float32List(0), sampleRate: 0); + } + + final samples = p.ref.samples.asTypedList(p.ref.n); + final sampleRate = p.ref.sampleRate; + final newSamples = Float32List.fromList(samples); + + SherpaOnnxBindings.destroyOfflineTtsGeneratedAudio?.call(p); + + return GeneratedAudio(samples: newSamples, sampleRate: sampleRate); + } + + int get sampleRate => + SherpaOnnxBindings.offlineTtsSampleRate?.call(this.ptr) ?? 0; + + int get numSpeakers => + SherpaOnnxBindings.offlineTtsNumSpeakers?.call(this.ptr) ?? 
0; + + Pointer ptr; + OfflineTtsConfig config; +} From 99a9da19de4fca60ed315fd08fb97b2ec01b9634 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 15 Jun 2024 14:52:19 +0800 Subject: [PATCH 022/237] add example description for the dart package (#1011) --- sherpa-onnx/flutter/example/example.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 sherpa-onnx/flutter/example/example.md diff --git a/sherpa-onnx/flutter/example/example.md b/sherpa-onnx/flutter/example/example.md new file mode 100644 index 000000000..3f8957b2e --- /dev/null +++ b/sherpa-onnx/flutter/example/example.md @@ -0,0 +1,18 @@ +# sherpa-onnx app example + +## Streaming speech recognition + +Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/streaming-asr + +## Non-streaming speech recognition + +Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/non-streaming-asr + +## Text to speech (TTS) + +Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/tts + +## Voice activity detection (VAD) + +Please see https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/vad + From e1201225f2b97e3fc001051927a2ce49aed455e0 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 16 Jun 2024 19:17:15 +0800 Subject: [PATCH 023/237] Add Android APK for Korean (#1015) --- scripts/apk/generate-asr-apk-script.py | 21 +++++++++++++++++++ .../csrc/online-recognizer-transducer-impl.h | 2 +- sherpa-onnx/kotlin-api/OnlineRecognizer.kt | 13 ++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/scripts/apk/generate-asr-apk-script.py b/scripts/apk/generate-asr-apk-script.py index 09a8915e6..8684877cf 100755 --- a/scripts/apk/generate-asr-apk-script.py +++ b/scripts/apk/generate-asr-apk-script.py @@ -209,6 +209,27 @@ def get_models(): ls -lh + popd + """, + ), + Model( + model_name="sherpa-onnx-streaming-zipformer-korean-2024-06-16", + idx=14, + lang="ko", + short_name="zipformer", + 
cmd=""" + pushd $model_name + rm -fv decoder-epoch-99-avg-1.int8.onnx + rm -fv encoder-epoch-99-avg-1.onnx + rm -fv joiner-epoch-99-avg-1.onnx + + rm -fv bpe.model + rm -fv README.md + rm -fv .gitattributes + rm -rfv test_wavs + + ls -lh + popd """, ), diff --git a/sherpa-onnx/csrc/online-recognizer-transducer-impl.h b/sherpa-onnx/csrc/online-recognizer-transducer-impl.h index 16c44b9de..a2531b10c 100644 --- a/sherpa-onnx/csrc/online-recognizer-transducer-impl.h +++ b/sherpa-onnx/csrc/online-recognizer-transducer-impl.h @@ -364,7 +364,7 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl { } // reset encoder states - s->SetStates(model_->GetEncoderInitStates()); + // s->SetStates(model_->GetEncoderInitStates()); // we keep the decoder_out decoder_->UpdateDecoderOut(&s->GetResult()); diff --git a/sherpa-onnx/kotlin-api/OnlineRecognizer.kt b/sherpa-onnx/kotlin-api/OnlineRecognizer.kt index e78fb6549..93a21e07e 100644 --- a/sherpa-onnx/kotlin-api/OnlineRecognizer.kt +++ b/sherpa-onnx/kotlin-api/OnlineRecognizer.kt @@ -355,6 +355,19 @@ fun getModelConfig(type: Int): OnlineModelConfig? 
{ tokens = "$modelDir/tokens.txt", ) } + + 14 -> { + val modelDir = "sherpa-onnx-streaming-zipformer-korean-2024-06-16" + return OnlineModelConfig( + transducer = OnlineTransducerModelConfig( + encoder = "$modelDir/encoder-epoch-99-avg-1.int8.onnx", + decoder = "$modelDir/decoder-epoch-99-avg-1.onnx", + joiner = "$modelDir/joiner-epoch-99-avg-1.int8.onnx", + ), + tokens = "$modelDir/tokens.txt", + modelType = "zipformer", + ) + } } return null } From dd69a1b56bac7b1c92de31c6e2c2ae220cda2382 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 17 Jun 2024 11:57:38 +0800 Subject: [PATCH 024/237] Release v1.9.30 (#1016) --- .../workflows/aarch64-linux-gnu-shared.yaml | 2 +- .../workflows/aarch64-linux-gnu-static.yaml | 2 +- .github/workflows/android.yaml | 2 +- .github/workflows/apk-asr-2pass.yaml | 2 +- .github/workflows/apk-asr.yaml | 2 +- .../workflows/apk-audio-tagging-wearos.yaml | 2 +- .github/workflows/apk-audio-tagging.yaml | 2 +- .github/workflows/apk-kws.yaml | 2 +- .../workflows/apk-speaker-identification.yaml | 2 +- .../apk-spoken-language-identification.yaml | 2 +- .github/workflows/apk-tts-engine.yaml | 2 +- .github/workflows/apk-tts.yaml | 2 +- .github/workflows/apk-vad-asr.yaml | 2 +- .github/workflows/apk-vad.yaml | 2 +- .github/workflows/arm-linux-gnueabihf.yaml | 2 +- .github/workflows/build-wheels-aarch64.yaml | 2 +- .github/workflows/build-wheels-armv7l.yaml | 2 +- .github/workflows/build-wheels-linux.yaml | 2 +- .../workflows/build-wheels-macos-arm64.yaml | 2 +- .../build-wheels-macos-universal2.yaml | 2 +- .github/workflows/build-wheels-macos-x64.yaml | 2 +- .github/workflows/build-wheels-win32.yaml | 2 +- .github/workflows/build-wheels-win64.yaml | 2 +- .github/workflows/build-xcframework.yaml | 2 +- .github/workflows/flutter-linux.yaml | 2 +- .github/workflows/flutter-macos.yaml | 2 +- .github/workflows/flutter-windows-x64.yaml | 2 +- .github/workflows/linux-gpu.yaml | 2 +- .github/workflows/linux.yaml | 2 +- .github/workflows/macos.yaml | 
2 +- .github/workflows/mfc.yaml | 2 +- .github/workflows/pkg-config.yaml | 2 +- .github/workflows/release-dart-package.yaml | 2 +- .github/workflows/riscv64-linux.yaml | 2 +- .github/workflows/windows-arm64.yaml | 2 +- .github/workflows/windows-x64-cuda.yaml | 2 +- .github/workflows/windows-x64-debug.yaml | 2 +- .github/workflows/windows-x64.yaml | 2 +- .github/workflows/windows-x86-debug.yaml | 2 +- .github/workflows/windows-x86.yaml | 2 +- CMakeLists.txt | 2 +- .../non-streaming-asr/pubspec.yaml | 2 +- dart-api-examples/streaming-asr/pubspec.yaml | 2 +- dart-api-examples/tts/pubspec.yaml | 2 +- dart-api-examples/vad/pubspec.yaml | 2 +- nodejs-addon-examples/package.json | 2 +- scripts/dart/release.sh | 2 +- sherpa-onnx/flutter/CHANGELOG.md | 4 ++ sherpa-onnx/flutter/lib/sherpa_onnx.dart | 6 +- .../flutter/lib/src/online_stream.dart | 2 +- .../lib/src/speaker_identification.dart | 66 +++++++++---------- sherpa-onnx/flutter/lib/src/tts.dart | 6 +- sherpa-onnx/flutter/lib/src/vad.dart | 8 +-- 53 files changed, 93 insertions(+), 93 deletions(-) diff --git a/.github/workflows/aarch64-linux-gnu-shared.yaml b/.github/workflows/aarch64-linux-gnu-shared.yaml index 50ba2c236..5e82d9b3a 100644 --- a/.github/workflows/aarch64-linux-gnu-shared.yaml +++ b/.github/workflows/aarch64-linux-gnu-shared.yaml @@ -6,7 +6,7 @@ on: branches: - master tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/aarch64-linux-gnu-shared.yaml' - 'CMakeLists.txt' diff --git a/.github/workflows/aarch64-linux-gnu-static.yaml b/.github/workflows/aarch64-linux-gnu-static.yaml index 13edc9c17..765e2422f 100644 --- a/.github/workflows/aarch64-linux-gnu-static.yaml +++ b/.github/workflows/aarch64-linux-gnu-static.yaml @@ -6,7 +6,7 @@ on: branches: - master tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/aarch64-linux-gnu-static.yaml' - 'CMakeLists.txt' diff --git a/.github/workflows/android.yaml b/.github/workflows/android.yaml index 69bfe2e3e..35dfd6b26 100644 --- 
a/.github/workflows/android.yaml +++ b/.github/workflows/android.yaml @@ -12,7 +12,7 @@ on: - 'sherpa-onnx/jni/*' - 'build-android*.sh' tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' pull_request: branches: - master diff --git a/.github/workflows/apk-asr-2pass.yaml b/.github/workflows/apk-asr-2pass.yaml index a79a28af3..1e781694f 100644 --- a/.github/workflows/apk-asr-2pass.yaml +++ b/.github/workflows/apk-asr-2pass.yaml @@ -3,7 +3,7 @@ name: apk-asr-2pass on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/apk-asr.yaml b/.github/workflows/apk-asr.yaml index d381cb1af..69490cc89 100644 --- a/.github/workflows/apk-asr.yaml +++ b/.github/workflows/apk-asr.yaml @@ -3,7 +3,7 @@ name: apk-asr on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/apk-audio-tagging-wearos.yaml b/.github/workflows/apk-audio-tagging-wearos.yaml index b2d2d959a..577b79acb 100644 --- a/.github/workflows/apk-audio-tagging-wearos.yaml +++ b/.github/workflows/apk-audio-tagging-wearos.yaml @@ -3,7 +3,7 @@ name: apk-audio-tagging-wearos on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/apk-audio-tagging.yaml b/.github/workflows/apk-audio-tagging.yaml index db739021f..efd71037a 100644 --- a/.github/workflows/apk-audio-tagging.yaml +++ b/.github/workflows/apk-audio-tagging.yaml @@ -3,7 +3,7 @@ name: apk-audio-tagging on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/apk-kws.yaml b/.github/workflows/apk-kws.yaml index 8c348e5a3..5629c5d06 100644 --- a/.github/workflows/apk-kws.yaml +++ b/.github/workflows/apk-kws.yaml @@ -3,7 +3,7 @@ name: apk-kws on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/apk-speaker-identification.yaml b/.github/workflows/apk-speaker-identification.yaml index 2d674c78c..ecaf06974 100644 --- 
a/.github/workflows/apk-speaker-identification.yaml +++ b/.github/workflows/apk-speaker-identification.yaml @@ -3,7 +3,7 @@ name: apk-speaker-identification on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/apk-spoken-language-identification.yaml b/.github/workflows/apk-spoken-language-identification.yaml index dd4322156..ad63dde6c 100644 --- a/.github/workflows/apk-spoken-language-identification.yaml +++ b/.github/workflows/apk-spoken-language-identification.yaml @@ -3,7 +3,7 @@ name: apk-slid on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/apk-tts-engine.yaml b/.github/workflows/apk-tts-engine.yaml index 89013a59c..d225df022 100644 --- a/.github/workflows/apk-tts-engine.yaml +++ b/.github/workflows/apk-tts-engine.yaml @@ -3,7 +3,7 @@ name: apk-tts-engine on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/apk-tts.yaml b/.github/workflows/apk-tts.yaml index 1b51368ed..60ae9c723 100644 --- a/.github/workflows/apk-tts.yaml +++ b/.github/workflows/apk-tts.yaml @@ -3,7 +3,7 @@ name: apk-tts on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/apk-vad-asr.yaml b/.github/workflows/apk-vad-asr.yaml index aea8028d7..574b4bf11 100644 --- a/.github/workflows/apk-vad-asr.yaml +++ b/.github/workflows/apk-vad-asr.yaml @@ -3,7 +3,7 @@ name: apk-vad-asr on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/apk-vad.yaml b/.github/workflows/apk-vad.yaml index ddd8c6afb..6d5259a79 100644 --- a/.github/workflows/apk-vad.yaml +++ b/.github/workflows/apk-vad.yaml @@ -3,7 +3,7 @@ name: apk-vad on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/arm-linux-gnueabihf.yaml b/.github/workflows/arm-linux-gnueabihf.yaml index 269456815..c260c5a0b 100644 --- 
a/.github/workflows/arm-linux-gnueabihf.yaml +++ b/.github/workflows/arm-linux-gnueabihf.yaml @@ -13,7 +13,7 @@ on: - 'sherpa-onnx/c-api/*' - 'toolchains/arm-linux-gnueabihf.toolchain.cmake' tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' pull_request: branches: - master diff --git a/.github/workflows/build-wheels-aarch64.yaml b/.github/workflows/build-wheels-aarch64.yaml index 971324e9b..4bc5b79c2 100644 --- a/.github/workflows/build-wheels-aarch64.yaml +++ b/.github/workflows/build-wheels-aarch64.yaml @@ -3,7 +3,7 @@ name: build-wheels-aarch64 on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: env: diff --git a/.github/workflows/build-wheels-armv7l.yaml b/.github/workflows/build-wheels-armv7l.yaml index a720a39f4..6b7d74460 100644 --- a/.github/workflows/build-wheels-armv7l.yaml +++ b/.github/workflows/build-wheels-armv7l.yaml @@ -3,7 +3,7 @@ name: build-wheels-armv7l on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: env: diff --git a/.github/workflows/build-wheels-linux.yaml b/.github/workflows/build-wheels-linux.yaml index 7a4f59011..426545622 100644 --- a/.github/workflows/build-wheels-linux.yaml +++ b/.github/workflows/build-wheels-linux.yaml @@ -3,7 +3,7 @@ name: build-wheels-linux on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: env: diff --git a/.github/workflows/build-wheels-macos-arm64.yaml b/.github/workflows/build-wheels-macos-arm64.yaml index c31d92bf4..bc02ce38f 100644 --- a/.github/workflows/build-wheels-macos-arm64.yaml +++ b/.github/workflows/build-wheels-macos-arm64.yaml @@ -3,7 +3,7 @@ name: build-wheels-macos-arm64 on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: env: diff --git a/.github/workflows/build-wheels-macos-universal2.yaml b/.github/workflows/build-wheels-macos-universal2.yaml index d08a93075..2ffaf93c9 100644 --- a/.github/workflows/build-wheels-macos-universal2.yaml +++ b/.github/workflows/build-wheels-macos-universal2.yaml @@ -5,7 +5,7 @@ on: 
branches: - wheel tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: env: diff --git a/.github/workflows/build-wheels-macos-x64.yaml b/.github/workflows/build-wheels-macos-x64.yaml index 250ef76c7..8ad21d0ed 100644 --- a/.github/workflows/build-wheels-macos-x64.yaml +++ b/.github/workflows/build-wheels-macos-x64.yaml @@ -5,7 +5,7 @@ on: branches: - wheel tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: env: diff --git a/.github/workflows/build-wheels-win32.yaml b/.github/workflows/build-wheels-win32.yaml index 752fbef32..ab3d32b13 100644 --- a/.github/workflows/build-wheels-win32.yaml +++ b/.github/workflows/build-wheels-win32.yaml @@ -3,7 +3,7 @@ name: build-wheels-win32 on: push: tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: env: diff --git a/.github/workflows/build-wheels-win64.yaml b/.github/workflows/build-wheels-win64.yaml index ee9f020a8..7e7d810d9 100644 --- a/.github/workflows/build-wheels-win64.yaml +++ b/.github/workflows/build-wheels-win64.yaml @@ -5,7 +5,7 @@ on: branches: - wheel tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: env: diff --git a/.github/workflows/build-xcframework.yaml b/.github/workflows/build-xcframework.yaml index f6d0dce73..2afd95cab 100644 --- a/.github/workflows/build-xcframework.yaml +++ b/.github/workflows/build-xcframework.yaml @@ -12,7 +12,7 @@ on: - 'sherpa-onnx/csrc/*' - 'sherpa-onnx/c-api/*' tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' pull_request: branches: - master diff --git a/.github/workflows/flutter-linux.yaml b/.github/workflows/flutter-linux.yaml index a74f45007..ca665fd8b 100644 --- a/.github/workflows/flutter-linux.yaml +++ b/.github/workflows/flutter-linux.yaml @@ -5,7 +5,7 @@ on: branches: - flutter tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/flutter-linux.yaml' - 'CMakeLists.txt' diff --git a/.github/workflows/flutter-macos.yaml b/.github/workflows/flutter-macos.yaml index 25a53d89a..d92d30879 100644 --- 
a/.github/workflows/flutter-macos.yaml +++ b/.github/workflows/flutter-macos.yaml @@ -5,7 +5,7 @@ on: branches: - flutter tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/flutter-macos.yaml' - 'CMakeLists.txt' diff --git a/.github/workflows/flutter-windows-x64.yaml b/.github/workflows/flutter-windows-x64.yaml index cfe54b053..ce4e7456a 100644 --- a/.github/workflows/flutter-windows-x64.yaml +++ b/.github/workflows/flutter-windows-x64.yaml @@ -5,7 +5,7 @@ on: branches: - flutter tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/flutter-windows-x64.yaml' - 'CMakeLists.txt' diff --git a/.github/workflows/linux-gpu.yaml b/.github/workflows/linux-gpu.yaml index bccde7390..3f6052e63 100644 --- a/.github/workflows/linux-gpu.yaml +++ b/.github/workflows/linux-gpu.yaml @@ -5,7 +5,7 @@ on: branches: - master tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/linux-gpu.yaml' - '.github/scripts/test-online-transducer.sh' diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 861a2df0e..3d95d3c5d 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -5,7 +5,7 @@ on: branches: - master tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/linux.yaml' - '.github/scripts/test-kws.sh' diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml index 1c7fe2ee1..cb88c8534 100644 --- a/.github/workflows/macos.yaml +++ b/.github/workflows/macos.yaml @@ -5,7 +5,7 @@ on: branches: - master tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/macos.yaml' - '.github/scripts/test-kws.sh' diff --git a/.github/workflows/mfc.yaml b/.github/workflows/mfc.yaml index 197386bbf..e501478a2 100644 --- a/.github/workflows/mfc.yaml +++ b/.github/workflows/mfc.yaml @@ -5,7 +5,7 @@ on: branches: - master tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/mfc.yaml' - 'CMakeLists.txt' diff --git 
a/.github/workflows/pkg-config.yaml b/.github/workflows/pkg-config.yaml index b1b24218e..57ed8a21a 100644 --- a/.github/workflows/pkg-config.yaml +++ b/.github/workflows/pkg-config.yaml @@ -6,7 +6,7 @@ on: - master - pkg-config tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/pkg-config.yaml' - '.github/scripts/test-offline-tts.sh' diff --git a/.github/workflows/release-dart-package.yaml b/.github/workflows/release-dart-package.yaml index 9ed4d0150..9f577ed25 100644 --- a/.github/workflows/release-dart-package.yaml +++ b/.github/workflows/release-dart-package.yaml @@ -5,7 +5,7 @@ on: branches: - ci-pub-dart tags: - - 'v[0-9]+.[0-9]+.[0-9]+*' # tag-pattern on pub.dev: 'v{{version}}' + - 'dart-v[0-9]+.[0-9]+.[0-9]+*' # tag-pattern on pub.dev: 'v{{version}}' workflow_dispatch: diff --git a/.github/workflows/riscv64-linux.yaml b/.github/workflows/riscv64-linux.yaml index 5393e9735..acae80e41 100644 --- a/.github/workflows/riscv64-linux.yaml +++ b/.github/workflows/riscv64-linux.yaml @@ -13,7 +13,7 @@ on: - 'toolchains/riscv64-linux-gnu.toolchain.cmake' - 'build-riscv64-linux-gnu.sh' tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' pull_request: branches: - master diff --git a/.github/workflows/windows-arm64.yaml b/.github/workflows/windows-arm64.yaml index 4fb85a9e1..9b56c7b22 100644 --- a/.github/workflows/windows-arm64.yaml +++ b/.github/workflows/windows-arm64.yaml @@ -5,7 +5,7 @@ on: branches: - master tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/windows-arm64.yaml' - 'CMakeLists.txt' diff --git a/.github/workflows/windows-x64-cuda.yaml b/.github/workflows/windows-x64-cuda.yaml index 0672065c2..557d1d34e 100644 --- a/.github/workflows/windows-x64-cuda.yaml +++ b/.github/workflows/windows-x64-cuda.yaml @@ -5,7 +5,7 @@ on: branches: - master tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/windows-x64-cuda.yaml' - '.github/scripts/test-online-transducer.sh' diff --git 
a/.github/workflows/windows-x64-debug.yaml b/.github/workflows/windows-x64-debug.yaml index fd4e1dd13..501763248 100644 --- a/.github/workflows/windows-x64-debug.yaml +++ b/.github/workflows/windows-x64-debug.yaml @@ -5,7 +5,7 @@ on: branches: - master tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/windows-x64-debug.yaml' - '.github/scripts/test-online-transducer.sh' diff --git a/.github/workflows/windows-x64.yaml b/.github/workflows/windows-x64.yaml index d1e0a2d4f..ebd61866a 100644 --- a/.github/workflows/windows-x64.yaml +++ b/.github/workflows/windows-x64.yaml @@ -5,7 +5,7 @@ on: branches: - master tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/windows-x64.yaml' - '.github/scripts/test-online-transducer.sh' diff --git a/.github/workflows/windows-x86-debug.yaml b/.github/workflows/windows-x86-debug.yaml index 0b56e157c..03dce165a 100644 --- a/.github/workflows/windows-x86-debug.yaml +++ b/.github/workflows/windows-x86-debug.yaml @@ -5,7 +5,7 @@ on: branches: - master tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/windows-x86-debug.yaml' - '.github/scripts/test-online-transducer.sh' diff --git a/.github/workflows/windows-x86.yaml b/.github/workflows/windows-x86.yaml index 1230b20a4..44321114a 100644 --- a/.github/workflows/windows-x86.yaml +++ b/.github/workflows/windows-x86.yaml @@ -5,7 +5,7 @@ on: branches: - master tags: - - '*' + - 'v[0-9]+.[0-9]+.[0-9]+*' paths: - '.github/workflows/windows-x86.yaml' - '.github/scripts/test-online-transducer.sh' diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c1f66416..974dd1c8f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ project(sherpa-onnx) # Remember to update # ./nodejs-addon-examples # ./dart-api-examples/ -set(SHERPA_ONNX_VERSION "1.9.29") +set(SHERPA_ONNX_VERSION "1.9.30") # Disable warning about # diff --git a/dart-api-examples/non-streaming-asr/pubspec.yaml b/dart-api-examples/non-streaming-asr/pubspec.yaml index 
61dbe71f2..9253f105d 100644 --- a/dart-api-examples/non-streaming-asr/pubspec.yaml +++ b/dart-api-examples/non-streaming-asr/pubspec.yaml @@ -10,7 +10,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.9.29 + sherpa_onnx: ^1.9.30 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/streaming-asr/pubspec.yaml b/dart-api-examples/streaming-asr/pubspec.yaml index 372fbf6e3..a740b371c 100644 --- a/dart-api-examples/streaming-asr/pubspec.yaml +++ b/dart-api-examples/streaming-asr/pubspec.yaml @@ -11,7 +11,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.9.29 + sherpa_onnx: ^1.9.30 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/tts/pubspec.yaml b/dart-api-examples/tts/pubspec.yaml index 205ef9b00..fed0bf4e6 100644 --- a/dart-api-examples/tts/pubspec.yaml +++ b/dart-api-examples/tts/pubspec.yaml @@ -8,7 +8,7 @@ environment: # Add regular dependencies here. dependencies: - sherpa_onnx: ^1.9.29 + sherpa_onnx: ^1.9.30 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml index 54c13e0fa..e0d6f6dce 100644 --- a/dart-api-examples/vad/pubspec.yaml +++ b/dart-api-examples/vad/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^1.9.29 + sherpa_onnx: ^1.9.30 path: ^1.9.0 args: ^2.5.0 diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json index 55298ec94..28d846a11 100644 --- a/nodejs-addon-examples/package.json +++ b/nodejs-addon-examples/package.json @@ -1,5 +1,5 @@ { "dependencies": { - "sherpa-onnx-node": "^1.9.29" + "sherpa-onnx-node": "^1.9.30" } } diff --git a/scripts/dart/release.sh b/scripts/dart/release.sh index 78b80ae61..d1d728364 100755 --- a/scripts/dart/release.sh +++ b/scripts/dart/release.sh @@ -27,7 +27,7 @@ HF_MIRROR=hf.co linux_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 
linux_wheel=$src_dir/$linux_wheel_filename -macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_10_14_universal2.whl +macos_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_11_0_universal2.whl macos_wheel=$src_dir/$macos_wheel_filename windows_x64_wheel_filename=sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl diff --git a/sherpa-onnx/flutter/CHANGELOG.md b/sherpa-onnx/flutter/CHANGELOG.md index 1e583389f..5f912155b 100644 --- a/sherpa-onnx/flutter/CHANGELOG.md +++ b/sherpa-onnx/flutter/CHANGELOG.md @@ -1,3 +1,7 @@ +## 1.9.30 + +* Add TTS + ## 1.9.29 * Publish with CI diff --git a/sherpa-onnx/flutter/lib/sherpa_onnx.dart b/sherpa-onnx/flutter/lib/sherpa_onnx.dart index 8cf2b16cc..0a2445619 100644 --- a/sherpa-onnx/flutter/lib/sherpa_onnx.dart +++ b/sherpa-onnx/flutter/lib/sherpa_onnx.dart @@ -25,7 +25,7 @@ final DynamicLibrary _dylib = () { if (_path == null) { return DynamicLibrary.open('libsherpa-onnx-c-api.dylib'); } else { - return DynamicLibrary.open('${_path}/libsherpa-onnx-c-api.dylib'); + return DynamicLibrary.open('$_path/libsherpa-onnx-c-api.dylib'); } } @@ -33,7 +33,7 @@ final DynamicLibrary _dylib = () { if (_path == null) { return DynamicLibrary.open('libsherpa-onnx-c-api.so'); } else { - return DynamicLibrary.open('${_path}/libsherpa-onnx-c-api.so'); + return DynamicLibrary.open('$_path/libsherpa-onnx-c-api.so'); } } @@ -41,7 +41,7 @@ final DynamicLibrary _dylib = () { if (_path == null) { return DynamicLibrary.open('sherpa-onnx-c-api.dll'); } else { - return DynamicLibrary.open('${_path}\\sherpa-onnx-c-api.dll'); + return DynamicLibrary.open('$_path\\sherpa-onnx-c-api.dll'); } } diff --git a/sherpa-onnx/flutter/lib/src/online_stream.dart b/sherpa-onnx/flutter/lib/src/online_stream.dart index 29b196221..e1f61e15c 100644 --- a/sherpa-onnx/flutter/lib/src/online_stream.dart +++ b/sherpa-onnx/flutter/lib/src/online_stream.dart @@ -34,7 +34,7 @@ class OnlineStream { } void inputFinished() { - 
SherpaOnnxBindings.onlineStreamInputFinished?.call(this.ptr); + SherpaOnnxBindings.onlineStreamInputFinished?.call(ptr); } Pointer ptr; diff --git a/sherpa-onnx/flutter/lib/src/speaker_identification.dart b/sherpa-onnx/flutter/lib/src/speaker_identification.dart index 0e01044fd..5c2e10744 100644 --- a/sherpa-onnx/flutter/lib/src/speaker_identification.dart +++ b/sherpa-onnx/flutter/lib/src/speaker_identification.dart @@ -71,7 +71,7 @@ class SpeakerEmbeddingExtractor { bool isReady(OnlineStream stream) { final int ready = SherpaOnnxBindings.speakerEmbeddingExtractorIsReady - ?.call(this.ptr, stream.ptr) ?? + ?.call(ptr, stream.ptr) ?? 0; return ready == 1; } @@ -79,15 +79,15 @@ class SpeakerEmbeddingExtractor { Float32List compute(OnlineStream stream) { final Pointer embedding = SherpaOnnxBindings .speakerEmbeddingExtractorComputeEmbedding - ?.call(this.ptr, stream.ptr) ?? + ?.call(ptr, stream.ptr) ?? nullptr; if (embedding == nullptr) { return Float32List(0); } - final embeddingList = embedding.asTypedList(this.dim); - final ans = Float32List(this.dim); + final embeddingList = embedding.asTypedList(dim); + final ans = Float32List(dim); ans.setAll(0, embeddingList); SherpaOnnxBindings.speakerEmbeddingExtractorDestroyEmbedding @@ -111,13 +111,13 @@ class SpeakerEmbeddingManager { } void free() { - SherpaOnnxBindings.destroySpeakerEmbeddingManager?.call(this.ptr); - this.ptr = nullptr; + SherpaOnnxBindings.destroySpeakerEmbeddingManager?.call(ptr); + ptr = nullptr; } /// Return true if added successfully; return false otherwise bool add({required String name, required Float32List embedding}) { - assert(embedding.length == this.dim, '${embedding.length} vs ${this.dim}'); + assert(embedding.length == dim, '${embedding.length} vs $dim'); final Pointer namePtr = name.toNativeUtf8(); final int n = embedding.length; @@ -126,9 +126,9 @@ class SpeakerEmbeddingManager { final pList = p.asTypedList(n); pList.setAll(0, embedding); - final int ok = 
SherpaOnnxBindings.speakerEmbeddingManagerAdd - ?.call(this.ptr, namePtr, p) ?? - 0; + final int ok = + SherpaOnnxBindings.speakerEmbeddingManagerAdd?.call(ptr, namePtr, p) ?? + 0; calloc.free(p); calloc.free(namePtr); @@ -141,19 +141,19 @@ class SpeakerEmbeddingManager { final Pointer namePtr = name.toNativeUtf8(); final int n = embeddingList.length; - final Pointer p = calloc(n * this.dim); - final pList = p.asTypedList(n * this.dim); + final Pointer p = calloc(n * dim); + final pList = p.asTypedList(n * dim); int offset = 0; for (final e in embeddingList) { - assert(e.length == this.dim, '${e.length} vs ${this.dim}'); + assert(e.length == dim, '${e.length} vs $dim'); pList.setAll(offset, e); - offset += this.dim; + offset += dim; } final int ok = SherpaOnnxBindings.speakerEmbeddingManagerAddListFlattened - ?.call(this.ptr, namePtr, p, n) ?? + ?.call(ptr, namePtr, p, n) ?? 0; calloc.free(p); @@ -166,7 +166,7 @@ class SpeakerEmbeddingManager { final Pointer namePtr = name.toNativeUtf8(); final int found = SherpaOnnxBindings.speakerEmbeddingManagerContains - ?.call(this.ptr, namePtr) ?? + ?.call(ptr, namePtr) ?? 0; calloc.free(namePtr); @@ -177,9 +177,9 @@ class SpeakerEmbeddingManager { bool remove(String name) { final Pointer namePtr = name.toNativeUtf8(); - final int ok = SherpaOnnxBindings.speakerEmbeddingManagerRemove - ?.call(this.ptr, namePtr) ?? - 0; + final int ok = + SherpaOnnxBindings.speakerEmbeddingManagerRemove?.call(ptr, namePtr) ?? 
+ 0; calloc.free(namePtr); @@ -188,14 +188,14 @@ class SpeakerEmbeddingManager { /// Return an empty string if no speaker is found String search({required Float32List embedding, required double threshold}) { - assert(embedding.length == this.dim); + assert(embedding.length == dim); - final Pointer p = calloc(this.dim); - final pList = p.asTypedList(this.dim); + final Pointer p = calloc(dim); + final pList = p.asTypedList(dim); pList.setAll(0, embedding); final Pointer name = SherpaOnnxBindings.speakerEmbeddingManagerSearch - ?.call(this.ptr, p, threshold) ?? + ?.call(ptr, p, threshold) ?? nullptr; calloc.free(p); @@ -215,16 +215,16 @@ class SpeakerEmbeddingManager { {required String name, required Float32List embedding, required double threshold}) { - assert(embedding.length == this.dim); + assert(embedding.length == dim); final Pointer namePtr = name.toNativeUtf8(); - final Pointer p = calloc(this.dim); - final pList = p.asTypedList(this.dim); + final Pointer p = calloc(dim); + final pList = p.asTypedList(dim); pList.setAll(0, embedding); final int ok = SherpaOnnxBindings.speakerEmbeddingManagerVerify - ?.call(this.ptr, namePtr, p, threshold) ?? + ?.call(ptr, namePtr, p, threshold) ?? 0; calloc.free(p); @@ -234,19 +234,17 @@ class SpeakerEmbeddingManager { } int get numSpeakers => - SherpaOnnxBindings.speakerEmbeddingManagerNumSpeakers?.call(this.ptr) ?? - 0; + SherpaOnnxBindings.speakerEmbeddingManagerNumSpeakers?.call(ptr) ?? 0; List get allSpeakerNames { - int n = this.numSpeakers; + int n = numSpeakers; if (n == 0) { return []; } - final Pointer> names = SherpaOnnxBindings - .speakerEmbeddingManagerGetAllSpeakers - ?.call(this.ptr) ?? - nullptr; + final Pointer> names = + SherpaOnnxBindings.speakerEmbeddingManagerGetAllSpeakers?.call(ptr) ?? 
+ nullptr; if (names == nullptr) { return []; diff --git a/sherpa-onnx/flutter/lib/src/tts.dart b/sherpa-onnx/flutter/lib/src/tts.dart index 76e00a44a..ad304df2a 100644 --- a/sherpa-onnx/flutter/lib/src/tts.dart +++ b/sherpa-onnx/flutter/lib/src/tts.dart @@ -1,5 +1,4 @@ // Copyright (c) 2024 Xiaomi Corporation -import 'dart:convert'; import 'dart:ffi'; import 'dart:typed_data'; @@ -184,11 +183,10 @@ class OfflineTts { return GeneratedAudio(samples: newSamples, sampleRate: sampleRate); } - int get sampleRate => - SherpaOnnxBindings.offlineTtsSampleRate?.call(this.ptr) ?? 0; + int get sampleRate => SherpaOnnxBindings.offlineTtsSampleRate?.call(ptr) ?? 0; int get numSpeakers => - SherpaOnnxBindings.offlineTtsNumSpeakers?.call(this.ptr) ?? 0; + SherpaOnnxBindings.offlineTtsNumSpeakers?.call(ptr) ?? 0; Pointer ptr; OfflineTtsConfig config; diff --git a/sherpa-onnx/flutter/lib/src/vad.dart b/sherpa-onnx/flutter/lib/src/vad.dart index 7f8b412a1..6c36cd8f0 100644 --- a/sherpa-onnx/flutter/lib/src/vad.dart +++ b/sherpa-onnx/flutter/lib/src/vad.dart @@ -76,14 +76,14 @@ class CircularBuffer { final pList = p.asTypedList(n); pList.setAll(0, data); - SherpaOnnxBindings.circularBufferPush?.call(this.ptr, p, n); + SherpaOnnxBindings.circularBufferPush?.call(ptr, p, n); calloc.free(p); } Float32List get({required int startIndex, required int n}) { final Pointer p = - SherpaOnnxBindings.circularBufferGet?.call(this.ptr, startIndex, n) ?? + SherpaOnnxBindings.circularBufferGet?.call(ptr, startIndex, n) ?? nullptr; if (p == nullptr) { @@ -99,11 +99,11 @@ class CircularBuffer { } void pop(int n) { - SherpaOnnxBindings.circularBufferPop?.call(this.ptr, n); + SherpaOnnxBindings.circularBufferPop?.call(ptr, n); } void reset() { - SherpaOnnxBindings.circularBufferReset?.call(this.ptr); + SherpaOnnxBindings.circularBufferReset?.call(ptr); } int get size => SherpaOnnxBindings.circularBufferSize?.call(ptr) ?? 
0; From b0f7ed3ee30818d347d6660b84bf9c8597200d7b Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 17 Jun 2024 14:28:53 +0800 Subject: [PATCH 025/237] Add inverse text normalization for non-streaming ASR (#1017) --- .github/scripts/test-python.sh | 13 +- .../inverse-text-normalization-offline-asr.py | 81 ++++++++++++ .../csrc/offline-recognizer-ctc-impl.h | 8 +- sherpa-onnx/csrc/offline-recognizer-impl.cc | 118 ++++++++++++++++++ sherpa-onnx/csrc/offline-recognizer-impl.h | 15 +++ .../csrc/offline-recognizer-paraformer-impl.h | 7 +- .../csrc/offline-recognizer-transducer-impl.h | 7 +- .../offline-recognizer-transducer-nemo-impl.h | 7 +- .../csrc/offline-recognizer-whisper-impl.h | 7 +- sherpa-onnx/csrc/offline-recognizer.cc | 46 ++++++- sherpa-onnx/csrc/offline-recognizer.h | 13 +- sherpa-onnx/python/csrc/offline-recognizer.cc | 7 +- .../python/sherpa_onnx/offline_recognizer.py | 70 +++++++++++ 13 files changed, 380 insertions(+), 19 deletions(-) create mode 100755 python-api-examples/inverse-text-normalization-offline-asr.py diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh index a52b5b910..c03b95426 100755 --- a/.github/scripts/test-python.sh +++ b/.github/scripts/test-python.sh @@ -248,7 +248,7 @@ if [[ x$OS != x'windows-latest' ]]; then python3 ./python-api-examples/online-decode-files.py \ --tokens=$repo/tokens.txt \ --encoder=$repo/encoder-epoch-99-avg-1.int8.onnx \ - --decoder=$repo/decoder-epoch-99-avg-1.int8.onnx \ + --decoder=$repo/decoder-epoch-99-avg-1.onnx \ --joiner=$repo/joiner-epoch-99-avg-1.int8.onnx \ $repo/test_wavs/0.wav \ $repo/test_wavs/1.wav \ @@ -286,7 +286,7 @@ python3 ./python-api-examples/offline-decode-files.py \ python3 ./python-api-examples/offline-decode-files.py \ --tokens=$repo/tokens.txt \ --encoder=$repo/encoder-epoch-99-avg-1.int8.onnx \ - --decoder=$repo/decoder-epoch-99-avg-1.int8.onnx \ + --decoder=$repo/decoder-epoch-99-avg-1.onnx \ --joiner=$repo/joiner-epoch-99-avg-1.int8.onnx \ 
$repo/test_wavs/0.wav \ $repo/test_wavs/1.wav \ @@ -330,6 +330,15 @@ if [[ x$OS != x'windows-latest' ]]; then python3 sherpa-onnx/python/tests/test_offline_recognizer.py --verbose + ln -s $repo $PWD/ + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav + + python3 ./python-api-examples/inverse-text-normalization-offline-asr.py + + rm -rfv sherpa-onnx-paraformer-zh-2023-03-28 + rm -rf $repo fi diff --git a/python-api-examples/inverse-text-normalization-offline-asr.py b/python-api-examples/inverse-text-normalization-offline-asr.py new file mode 100755 index 000000000..3228e01b9 --- /dev/null +++ b/python-api-examples/inverse-text-normalization-offline-asr.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2024 Xiaomi Corporation + +""" +This script shows how to use inverse text normalization with non-streaming ASR. + +Usage: + +(1) Download the test model + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + +(2) Download rule fst + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + +Please refer to +https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/itn_zh_number.ipynb +for how itn_zh_number.fst is generated. 
+ +(3) Download test wave + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav + +(4) Run this script + +python3 ./python-api-examples/inverse-text-normalization-offline-asr.py +""" +from pathlib import Path + +import sherpa_onnx +import soundfile as sf + + +def create_recognizer(): + model = "./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx" + tokens = "./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt" + rule_fsts = "./itn_zh_number.fst" + + if ( + not Path(model).is_file() + or not Path(tokens).is_file() + or not Path(rule_fsts).is_file() + ): + raise ValueError( + """Please download model files from + https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + """ + ) + return sherpa_onnx.OfflineRecognizer.from_paraformer( + paraformer=model, + tokens=tokens, + debug=True, + rule_fsts=rule_fsts, + ) + + +def main(): + recognizer = create_recognizer() + wave_filename = "./itn-zh-number.wav" + if not Path(wave_filename).is_file(): + raise ValueError( + """Please download model files from + https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + """ + ) + audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True) + audio = audio[:, 0] # only use the first channel + + stream = recognizer.create_stream() + stream.accept_waveform(sample_rate, audio) + recognizer.decode_stream(stream) + print(wave_filename) + print(stream.result) + + +if __name__ == "__main__": + main() diff --git a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h index c64da12af..cbe9a9e88 100644 --- a/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-ctc-impl.h @@ -73,7 +73,8 @@ static OfflineRecognitionResult Convert(const OfflineCtcDecoderResult &src, class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { public: explicit OfflineRecognizerCtcImpl(const OfflineRecognizerConfig &config) - : config_(config), + : 
OfflineRecognizerImpl(config), + config_(config), symbol_table_(config_.model_config.tokens), model_(OfflineCtcModel::Create(config_.model_config)) { Init(); @@ -82,7 +83,8 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { #if __ANDROID_API__ >= 9 OfflineRecognizerCtcImpl(AAssetManager *mgr, const OfflineRecognizerConfig &config) - : config_(config), + : OfflineRecognizerImpl(mgr, config), + config_(config), symbol_table_(mgr, config_.model_config.tokens), model_(OfflineCtcModel::Create(mgr, config_.model_config)) { Init(); @@ -205,6 +207,7 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { for (int32_t i = 0; i != n; ++i) { auto r = Convert(results[i], symbol_table_, frame_shift_ms, model_->SubsamplingFactor()); + r.text = ApplyInverseTextNormalization(std::move(r.text)); ss[i]->SetResult(r); } } @@ -238,6 +241,7 @@ class OfflineRecognizerCtcImpl : public OfflineRecognizerImpl { auto r = Convert(results[0], symbol_table_, frame_shift_ms, model_->SubsamplingFactor()); + r.text = ApplyInverseTextNormalization(std::move(r.text)); s->SetResult(r); } diff --git a/sherpa-onnx/csrc/offline-recognizer-impl.cc b/sherpa-onnx/csrc/offline-recognizer-impl.cc index 656425778..546d0f9bf 100644 --- a/sherpa-onnx/csrc/offline-recognizer-impl.cc +++ b/sherpa-onnx/csrc/offline-recognizer-impl.cc @@ -5,7 +5,18 @@ #include "sherpa-onnx/csrc/offline-recognizer-impl.h" #include +#include +#include +#if __ANDROID_API__ >= 9 +#include + +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#include "fst/extensions/far/far.h" +#include "kaldifst/csrc/kaldi-fst-io.h" #include "onnxruntime_cxx_api.h" // NOLINT #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/offline-recognizer-ctc-impl.h" @@ -316,4 +327,111 @@ std::unique_ptr OfflineRecognizerImpl::Create( } #endif +OfflineRecognizerImpl::OfflineRecognizerImpl( + const OfflineRecognizerConfig &config) + : config_(config) { + if (!config.rule_fsts.empty()) { + 
std::vector files; + SplitStringToVector(config.rule_fsts, ",", false, &files); + itn_list_.reserve(files.size()); + for (const auto &f : files) { + if (config.model_config.debug) { + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str()); + } + itn_list_.push_back(std::make_unique(f)); + } + } + + if (!config.rule_fars.empty()) { + if (config.model_config.debug) { + SHERPA_ONNX_LOGE("Loading FST archives"); + } + std::vector files; + SplitStringToVector(config.rule_fars, ",", false, &files); + + itn_list_.reserve(files.size() + itn_list_.size()); + + for (const auto &f : files) { + if (config.model_config.debug) { + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); + } + std::unique_ptr> reader( + fst::FarReader::Open(f)); + for (; !reader->Done(); reader->Next()) { + std::unique_ptr r( + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); + + itn_list_.push_back( + std::make_unique(std::move(r))); + } + } + + if (config.model_config.debug) { + SHERPA_ONNX_LOGE("FST archives loaded!"); + } + } +} + +#if __ANDROID_API__ >= 9 +OfflineRecognizerImpl::OfflineRecognizerImpl( + AAssetManager *mgr, const OfflineRecognizerConfig &config) + : config_(config) { + if (!config.rule_fsts.empty()) { + std::vector files; + SplitStringToVector(config.rule_fsts, ",", false, &files); + itn_list_.reserve(files.size()); + for (const auto &f : files) { + if (config.model_config.debug) { + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str()); + } + auto buf = ReadFile(mgr, f); + std::istrstream is(buf.data(), buf.size()); + itn_list_.push_back(std::make_unique(is)); + } + } + + if (!config.rule_fars.empty()) { + std::vector files; + SplitStringToVector(config.rule_fars, ",", false, &files); + itn_list_.reserve(files.size() + itn_list_.size()); + + for (const auto &f : files) { + if (config.model_config.debug) { + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); + } + + auto buf = ReadFile(mgr, f); + + std::unique_ptr s( + new std::istrstream(buf.data(), buf.size())); + + std::unique_ptr> reader( + 
fst::FarReader::Open(std::move(s))); + + for (; !reader->Done(); reader->Next()) { + std::unique_ptr r( + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); + + itn_list_.push_back( + std::make_unique(std::move(r))); + } // for (; !reader->Done(); reader->Next()) + } // for (const auto &f : files) + } // if (!config.rule_fars.empty()) +} +#endif + +std::string OfflineRecognizerImpl::ApplyInverseTextNormalization( + std::string text) const { + if (!itn_list_.empty()) { + for (const auto &tn : itn_list_) { + text = tn->Normalize(text); + if (config_.model_config.debug) { + SHERPA_ONNX_LOGE("After inverse text normalization: %s", text.c_str()); + } + } + } + + return text; +} + } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-recognizer-impl.h b/sherpa-onnx/csrc/offline-recognizer-impl.h index b849de653..1ba268c11 100644 --- a/sherpa-onnx/csrc/offline-recognizer-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-impl.h @@ -14,6 +14,7 @@ #include "android/asset_manager_jni.h" #endif +#include "kaldifst/csrc/text-normalizer.h" #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/offline-recognizer.h" #include "sherpa-onnx/csrc/offline-stream.h" @@ -22,10 +23,15 @@ namespace sherpa_onnx { class OfflineRecognizerImpl { public: + explicit OfflineRecognizerImpl(const OfflineRecognizerConfig &config); + static std::unique_ptr Create( const OfflineRecognizerConfig &config); #if __ANDROID_API__ >= 9 + OfflineRecognizerImpl(AAssetManager *mgr, + const OfflineRecognizerConfig &config); + static std::unique_ptr Create( AAssetManager *mgr, const OfflineRecognizerConfig &config); #endif @@ -41,6 +47,15 @@ class OfflineRecognizerImpl { virtual std::unique_ptr CreateStream() const = 0; virtual void DecodeStreams(OfflineStream **ss, int32_t n) const = 0; + + std::string ApplyInverseTextNormalization(std::string text) const; + + private: + OfflineRecognizerConfig config_; + // for inverse text normalization. 
Used only if + // config.rule_fsts is not empty or + // config.rule_fars is not empty + std::vector> itn_list_; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-recognizer-paraformer-impl.h b/sherpa-onnx/csrc/offline-recognizer-paraformer-impl.h index 3bcaf390b..a0d4af3b6 100644 --- a/sherpa-onnx/csrc/offline-recognizer-paraformer-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-paraformer-impl.h @@ -89,7 +89,8 @@ class OfflineRecognizerParaformerImpl : public OfflineRecognizerImpl { public: explicit OfflineRecognizerParaformerImpl( const OfflineRecognizerConfig &config) - : config_(config), + : OfflineRecognizerImpl(config), + config_(config), symbol_table_(config_.model_config.tokens), model_(std::make_unique(config.model_config)) { if (config.decoding_method == "greedy_search") { @@ -109,7 +110,8 @@ class OfflineRecognizerParaformerImpl : public OfflineRecognizerImpl { #if __ANDROID_API__ >= 9 OfflineRecognizerParaformerImpl(AAssetManager *mgr, const OfflineRecognizerConfig &config) - : config_(config), + : OfflineRecognizerImpl(mgr, config), + config_(config), symbol_table_(mgr, config_.model_config.tokens), model_(std::make_unique(mgr, config.model_config)) { @@ -204,6 +206,7 @@ class OfflineRecognizerParaformerImpl : public OfflineRecognizerImpl { for (int32_t i = 0; i != n; ++i) { auto r = Convert(results[i], symbol_table_); + r.text = ApplyInverseTextNormalization(std::move(r.text)); ss[i]->SetResult(r); } } diff --git a/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h b/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h index 265f42bb9..13357f79c 100644 --- a/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h @@ -74,7 +74,8 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl { public: explicit OfflineRecognizerTransducerImpl( const OfflineRecognizerConfig &config) - : config_(config), + : OfflineRecognizerImpl(config), + config_(config), 
symbol_table_(config_.model_config.tokens), model_(std::make_unique(config_.model_config)) { if (config_.decoding_method == "greedy_search") { @@ -107,7 +108,8 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl { #if __ANDROID_API__ >= 9 explicit OfflineRecognizerTransducerImpl( AAssetManager *mgr, const OfflineRecognizerConfig &config) - : config_(config), + : OfflineRecognizerImpl(mgr, config), + config_(config), symbol_table_(mgr, config_.model_config.tokens), model_(std::make_unique(mgr, config_.model_config)) { @@ -230,6 +232,7 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl { for (int32_t i = 0; i != n; ++i) { auto r = Convert(results[i], symbol_table_, frame_shift_ms, model_->SubsamplingFactor()); + r.text = ApplyInverseTextNormalization(std::move(r.text)); ss[i]->SetResult(r); } diff --git a/sherpa-onnx/csrc/offline-recognizer-transducer-nemo-impl.h b/sherpa-onnx/csrc/offline-recognizer-transducer-nemo-impl.h index 127fe3433..d5902b05b 100644 --- a/sherpa-onnx/csrc/offline-recognizer-transducer-nemo-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-transducer-nemo-impl.h @@ -41,7 +41,8 @@ class OfflineRecognizerTransducerNeMoImpl : public OfflineRecognizerImpl { public: explicit OfflineRecognizerTransducerNeMoImpl( const OfflineRecognizerConfig &config) - : config_(config), + : OfflineRecognizerImpl(config), + config_(config), symbol_table_(config_.model_config.tokens), model_(std::make_unique( config_.model_config)) { @@ -59,7 +60,8 @@ class OfflineRecognizerTransducerNeMoImpl : public OfflineRecognizerImpl { #if __ANDROID_API__ >= 9 explicit OfflineRecognizerTransducerNeMoImpl( AAssetManager *mgr, const OfflineRecognizerConfig &config) - : config_(config), + : OfflineRecognizerImpl(mgr, config), + config_(config), symbol_table_(mgr, config_.model_config.tokens), model_(std::make_unique( mgr, config_.model_config)) { @@ -131,6 +133,7 @@ class OfflineRecognizerTransducerNeMoImpl : public OfflineRecognizerImpl { 
for (int32_t i = 0; i != n; ++i) { auto r = Convert(results[i], symbol_table_, frame_shift_ms, model_->SubsamplingFactor()); + r.text = ApplyInverseTextNormalization(std::move(r.text)); ss[i]->SetResult(r); } diff --git a/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h b/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h index d224c8607..358917608 100644 --- a/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-whisper-impl.h @@ -52,7 +52,8 @@ static OfflineRecognitionResult Convert(const OfflineWhisperDecoderResult &src, class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl { public: explicit OfflineRecognizerWhisperImpl(const OfflineRecognizerConfig &config) - : config_(config), + : OfflineRecognizerImpl(config), + config_(config), symbol_table_(config_.model_config.tokens), model_(std::make_unique(config.model_config)) { Init(); @@ -61,7 +62,8 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl { #if __ANDROID_API__ >= 9 OfflineRecognizerWhisperImpl(AAssetManager *mgr, const OfflineRecognizerConfig &config) - : config_(config), + : OfflineRecognizerImpl(mgr, config), + config_(config), symbol_table_(mgr, config_.model_config.tokens), model_( std::make_unique(mgr, config.model_config)) { @@ -150,6 +152,7 @@ class OfflineRecognizerWhisperImpl : public OfflineRecognizerImpl { std::move(cross_kv.second)); auto r = Convert(results[0], symbol_table_); + r.text = ApplyInverseTextNormalization(std::move(r.text)); s->SetResult(r); } catch (const Ort::Exception &ex) { SHERPA_ONNX_LOGE( diff --git a/sherpa-onnx/csrc/offline-recognizer.cc b/sherpa-onnx/csrc/offline-recognizer.cc index d6ba4905c..1285a5cd3 100644 --- a/sherpa-onnx/csrc/offline-recognizer.cc +++ b/sherpa-onnx/csrc/offline-recognizer.cc @@ -10,7 +10,7 @@ #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/offline-lm-config.h" #include "sherpa-onnx/csrc/offline-recognizer-impl.h" - +#include "sherpa-onnx/csrc/text-utils.h" 
namespace sherpa_onnx { void OfflineRecognizerConfig::Register(ParseOptions *po) { @@ -44,6 +44,16 @@ void OfflineRecognizerConfig::Register(ParseOptions *po) { po->Register("hotwords-score", &hotwords_score, "The bonus score for each token in context word/phrase. " "Used only when decoding_method is modified_beam_search"); + + po->Register( + "rule-fsts", &rule_fsts, + "If not empty, it specifies fsts for inverse text normalization. " + "If there are multiple fsts, they are separated by a comma."); + + po->Register( + "rule-fars", &rule_fars, + "If not empty, it specifies fst archives for inverse text normalization. " + "If there are multiple archives, they are separated by a comma."); } bool OfflineRecognizerConfig::Validate() const { @@ -61,7 +71,7 @@ bool OfflineRecognizerConfig::Validate() const { if (!hotwords_file.empty() && decoding_method != "modified_beam_search") { SHERPA_ONNX_LOGE( "Please use --decoding-method=modified_beam_search if you" - " provide --hotwords-file. Given --decoding-method=%s", + " provide --hotwords-file. Given --decoding-method='%s'", decoding_method.c_str()); return false; } @@ -72,6 +82,34 @@ bool OfflineRecognizerConfig::Validate() const { return false; } + if (!hotwords_file.empty() && !FileExists(hotwords_file)) { + SHERPA_ONNX_LOGE("--hotwords-file: '%s' does not exist", + hotwords_file.c_str()); + return false; + } + + if (!rule_fsts.empty()) { + std::vector files; + SplitStringToVector(rule_fsts, ",", false, &files); + for (const auto &f : files) { + if (!FileExists(f)) { + SHERPA_ONNX_LOGE("Rule fst '%s' does not exist. ", f.c_str()); + return false; + } + } + } + + if (!rule_fars.empty()) { + std::vector files; + SplitStringToVector(rule_fars, ",", false, &files); + for (const auto &f : files) { + if (!FileExists(f)) { + SHERPA_ONNX_LOGE("Rule far '%s' does not exist. 
", f.c_str()); + return false; + } + } + } + return model_config.Validate(); } @@ -87,7 +125,9 @@ std::string OfflineRecognizerConfig::ToString() const { os << "max_active_paths=" << max_active_paths << ", "; os << "hotwords_file=\"" << hotwords_file << "\", "; os << "hotwords_score=" << hotwords_score << ", "; - os << "blank_penalty=" << blank_penalty << ")"; + os << "blank_penalty=" << blank_penalty << ", "; + os << "rule_fsts=\"" << rule_fsts << "\", "; + os << "rule_fars=\"" << rule_fars << "\")"; return os.str(); } diff --git a/sherpa-onnx/csrc/offline-recognizer.h b/sherpa-onnx/csrc/offline-recognizer.h index e93d7edc4..9290a53b5 100644 --- a/sherpa-onnx/csrc/offline-recognizer.h +++ b/sherpa-onnx/csrc/offline-recognizer.h @@ -40,6 +40,12 @@ struct OfflineRecognizerConfig { float blank_penalty = 0.0; + // If there are multiple rules, they are applied from left to right. + std::string rule_fsts; + + // If there are multiple FST archives, they are applied from left to right. + std::string rule_fars; + // only greedy_search is implemented // TODO(fangjun): Implement modified_beam_search @@ -50,7 +56,8 @@ struct OfflineRecognizerConfig { const OfflineCtcFstDecoderConfig &ctc_fst_decoder_config, const std::string &decoding_method, int32_t max_active_paths, const std::string &hotwords_file, float hotwords_score, - float blank_penalty) + float blank_penalty, const std::string &rule_fsts, + const std::string &rule_fars) : feat_config(feat_config), model_config(model_config), lm_config(lm_config), @@ -59,7 +66,9 @@ struct OfflineRecognizerConfig { max_active_paths(max_active_paths), hotwords_file(hotwords_file), hotwords_score(hotwords_score), - blank_penalty(blank_penalty) {} + blank_penalty(blank_penalty), + rule_fsts(rule_fsts), + rule_fars(rule_fars) {} void Register(ParseOptions *po); bool Validate() const; diff --git a/sherpa-onnx/python/csrc/offline-recognizer.cc b/sherpa-onnx/python/csrc/offline-recognizer.cc index 5ef9d4f2f..2a603e08f 100644 --- 
a/sherpa-onnx/python/csrc/offline-recognizer.cc +++ b/sherpa-onnx/python/csrc/offline-recognizer.cc @@ -17,13 +17,14 @@ static void PybindOfflineRecognizerConfig(py::module *m) { .def(py::init(), + float, const std::string &, const std::string &>(), py::arg("feat_config"), py::arg("model_config"), py::arg("lm_config") = OfflineLMConfig(), py::arg("ctc_fst_decoder_config") = OfflineCtcFstDecoderConfig(), py::arg("decoding_method") = "greedy_search", py::arg("max_active_paths") = 4, py::arg("hotwords_file") = "", - py::arg("hotwords_score") = 1.5, py::arg("blank_penalty") = 0.0) + py::arg("hotwords_score") = 1.5, py::arg("blank_penalty") = 0.0, + py::arg("rule_fsts") = "", py::arg("rule_fars") = "") .def_readwrite("feat_config", &PyClass::feat_config) .def_readwrite("model_config", &PyClass::model_config) .def_readwrite("lm_config", &PyClass::lm_config) @@ -33,6 +34,8 @@ static void PybindOfflineRecognizerConfig(py::module *m) { .def_readwrite("hotwords_file", &PyClass::hotwords_file) .def_readwrite("hotwords_score", &PyClass::hotwords_score) .def_readwrite("blank_penalty", &PyClass::blank_penalty) + .def_readwrite("rule_fsts", &PyClass::rule_fsts) + .def_readwrite("rule_fars", &PyClass::rule_fars) .def("__str__", &PyClass::ToString); } diff --git a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py index 480ea23ce..2fade069a 100644 --- a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py +++ b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py @@ -54,6 +54,8 @@ def from_transducer( debug: bool = False, provider: str = "cpu", model_type: str = "transducer", + rule_fsts: str = "", + rule_fars: str = "", ): """ Please refer to @@ -107,6 +109,12 @@ def from_transducer( True to show debug messages. provider: onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. 
+ If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. + If there are multiple archives, they are separated by a comma. """ self = cls.__new__(cls) model_config = OfflineModelConfig( @@ -143,6 +151,8 @@ def from_transducer( hotwords_file=hotwords_file, hotwords_score=hotwords_score, blank_penalty=blank_penalty, + rule_fsts=rule_fsts, + rule_fars=rule_fars, ) self.recognizer = _Recognizer(recognizer_config) self.config = recognizer_config @@ -159,6 +169,8 @@ def from_paraformer( decoding_method: str = "greedy_search", debug: bool = False, provider: str = "cpu", + rule_fsts: str = "", + rule_fars: str = "", ): """ Please refer to @@ -186,6 +198,12 @@ def from_paraformer( True to show debug messages. provider: onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. + If there are multiple archives, they are separated by a comma. """ self = cls.__new__(cls) model_config = OfflineModelConfig( @@ -206,6 +224,8 @@ def from_paraformer( feat_config=feat_config, model_config=model_config, decoding_method=decoding_method, + rule_fsts=rule_fsts, + rule_fars=rule_fars, ) self.recognizer = _Recognizer(recognizer_config) self.config = recognizer_config @@ -222,6 +242,8 @@ def from_telespeech_ctc( decoding_method: str = "greedy_search", debug: bool = False, provider: str = "cpu", + rule_fsts: str = "", + rule_fars: str = "", ): """ Please refer to @@ -251,6 +273,12 @@ def from_telespeech_ctc( True to show debug messages. provider: onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. 
+ rule_fars: + If not empty, it specifies fst archives for inverse text normalization. + If there are multiple archives, they are separated by a comma. """ self = cls.__new__(cls) model_config = OfflineModelConfig( @@ -271,6 +299,8 @@ def from_telespeech_ctc( feat_config=feat_config, model_config=model_config, decoding_method=decoding_method, + rule_fsts=rule_fsts, + rule_fars=rule_fars, ) self.recognizer = _Recognizer(recognizer_config) self.config = recognizer_config @@ -287,6 +317,8 @@ def from_nemo_ctc( decoding_method: str = "greedy_search", debug: bool = False, provider: str = "cpu", + rule_fsts: str = "", + rule_fars: str = "", ): """ Please refer to @@ -315,6 +347,12 @@ def from_nemo_ctc( True to show debug messages. provider: onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. + If there are multiple archives, they are separated by a comma. """ self = cls.__new__(cls) model_config = OfflineModelConfig( @@ -335,6 +373,8 @@ def from_nemo_ctc( feat_config=feat_config, model_config=model_config, decoding_method=decoding_method, + rule_fsts=rule_fsts, + rule_fars=rule_fars, ) self.recognizer = _Recognizer(recognizer_config) self.config = recognizer_config @@ -353,6 +393,8 @@ def from_whisper( debug: bool = False, provider: str = "cpu", tail_paddings: int = -1, + rule_fsts: str = "", + rule_fars: str = "", ): """ Please refer to @@ -389,6 +431,12 @@ def from_whisper( True to show debug messages. provider: onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. 
+ If there are multiple archives, they are separated by a comma. """ self = cls.__new__(cls) model_config = OfflineModelConfig( @@ -415,6 +463,8 @@ def from_whisper( feat_config=feat_config, model_config=model_config, decoding_method=decoding_method, + rule_fsts=rule_fsts, + rule_fars=rule_fars, ) self.recognizer = _Recognizer(recognizer_config) self.config = recognizer_config @@ -431,6 +481,8 @@ def from_tdnn_ctc( decoding_method: str = "greedy_search", debug: bool = False, provider: str = "cpu", + rule_fsts: str = "", + rule_fars: str = "", ): """ Please refer to @@ -458,6 +510,12 @@ def from_tdnn_ctc( True to show debug messages. provider: onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. + If there are multiple archives, they are separated by a comma. """ self = cls.__new__(cls) model_config = OfflineModelConfig( @@ -478,6 +536,8 @@ def from_tdnn_ctc( feat_config=feat_config, model_config=model_config, decoding_method=decoding_method, + rule_fsts=rule_fsts, + rule_fars=rule_fars, ) self.recognizer = _Recognizer(recognizer_config) self.config = recognizer_config @@ -494,6 +554,8 @@ def from_wenet_ctc( decoding_method: str = "greedy_search", debug: bool = False, provider: str = "cpu", + rule_fsts: str = "", + rule_fars: str = "", ): """ Please refer to @@ -522,6 +584,12 @@ def from_wenet_ctc( True to show debug messages. provider: onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. + If there are multiple archives, they are separated by a comma. 
""" self = cls.__new__(cls) model_config = OfflineModelConfig( @@ -542,6 +610,8 @@ def from_wenet_ctc( feat_config=feat_config, model_config=model_config, decoding_method=decoding_method, + rule_fsts=rule_fsts, + rule_fars=rule_fars, ) self.recognizer = _Recognizer(recognizer_config) self.config = recognizer_config From 6e09933d99c6c501f272b1a75dd2f8cfca17f150 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 17 Jun 2024 17:02:39 +0800 Subject: [PATCH 026/237] Inverse text normalization API for other programming languages (#1019) --- .github/scripts/test-dart.sh | 67 ++++----- .github/scripts/test-dot-net.sh | 1 + .github/scripts/test-nodejs-addon-npm.sh | 6 + .github/scripts/test-nodejs-npm.sh | 9 ++ .github/workflows/run-java-test.yaml | 2 + .github/workflows/test-dot-net.yaml | 59 +------- .github/workflows/test-go.yaml | 3 +- .github/workflows/test-nodejs-addon-api.yaml | 4 +- .gitignore | 1 + .../non-streaming-asr/bin/paraformer-itn.dart | 63 +++++++++ .../non-streaming-asr/run-paraformer-itn.sh | 27 ++++ .../offline-decode-files/Program.cs | 5 + .../run-paraformer-itn.sh | 24 ++++ .../run-telespeech-ctc.sh | 2 +- .../non-streaming-decode-files/main.go | 2 + .../run-paraformer-itn.sh | 28 ++++ .../run-telespeech-ctc.sh | 2 +- ...xtNormalizationNonStreamingParaformer.java | 54 ++++++++ ...n-inverse-text-normalization-paraformer.sh | 46 +++++++ kotlin-api-examples/run.sh | 29 ++++ kotlin-api-examples/test_itn_asr.kt | 37 +++++ .../test_asr_non_streaming_paraformer_itn.js | 48 +++++++ .../test-offline-paraformer-itn.js | 128 ++++++++++++++++++ scripts/dotnet/OfflineRecognizerConfig.cs | 9 +- .../run-paraformer-itn.sh | 1 + scripts/go/sherpa_onnx.go | 15 ++ scripts/node-addon-api/README.md | 4 +- .../node-addon-api/src/non-streaming-asr.cc | 10 ++ .../node-addon-api/src/non-streaming-tts.cc | 2 +- sherpa-onnx/c-api/c-api.cc | 3 + sherpa-onnx/c-api/c-api.h | 2 + .../flutter/lib/src/offline_recognizer.dart | 12 +- .../flutter/lib/src/sherpa_onnx_bindings.dart 
| 3 + .../sherpa/onnx/OfflineRecognizerConfig.java | 16 +++ sherpa-onnx/jni/offline-recognizer.cc | 12 ++ sherpa-onnx/kotlin-api/OfflineRecognizer.kt | 2 + swift-api-examples/SherpaOnnx.swift | 8 +- wasm/asr/sherpa-onnx-asr.js | 23 +++- wasm/nodejs/sherpa-onnx-wasm-nodejs.cc | 4 +- 39 files changed, 669 insertions(+), 104 deletions(-) create mode 100644 dart-api-examples/non-streaming-asr/bin/paraformer-itn.dart create mode 100755 dart-api-examples/non-streaming-asr/run-paraformer-itn.sh create mode 100755 dotnet-examples/offline-decode-files/run-paraformer-itn.sh create mode 100755 go-api-examples/non-streaming-decode-files/run-paraformer-itn.sh create mode 100644 java-api-examples/InverseTextNormalizationNonStreamingParaformer.java create mode 100755 java-api-examples/run-inverse-text-normalization-paraformer.sh create mode 100644 kotlin-api-examples/test_itn_asr.kt create mode 100644 nodejs-addon-examples/test_asr_non_streaming_paraformer_itn.js create mode 100644 nodejs-examples/test-offline-paraformer-itn.js create mode 120000 scripts/go/_internal/non-streaming-decode-files/run-paraformer-itn.sh diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index 763f2bcc4..0850a72b1 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -4,6 +4,41 @@ set -ex cd dart-api-examples +pushd non-streaming-asr + +echo '----------paraformer itn----------' +./run-paraformer-itn.sh + +echo '----------paraformer----------' +./run-paraformer.sh +rm -rf sherpa-onnx-* + +echo '----------VAD with paraformer----------' +./run-vad-with-paraformer.sh +rm -rf sherpa-onnx-* + +echo '----------NeMo transducer----------' +./run-nemo-transducer.sh +rm -rf sherpa-onnx-* + +echo '----------NeMo CTC----------' +./run-nemo-ctc.sh +rm -rf sherpa-onnx-* + +echo '----------TeleSpeech CTC----------' +./run-telespeech-ctc.sh +rm -rf sherpa-onnx-* + +echo '----------whisper----------' +./run-whisper.sh +rm -rf sherpa-onnx-* + +echo '----------zipformer 
transducer----------' +./run-zipformer-transducer.sh +rm -rf sherpa-onnx-* + +popd # non-streaming-asr + pushd tts echo '----------piper tts----------' @@ -44,38 +79,6 @@ rm -rf sherpa-onnx-* popd # streaming-asr -pushd non-streaming-asr - -echo '----------VAD with paraformer----------' -./run-vad-with-paraformer.sh -rm -rf sherpa-onnx-* - -echo '----------NeMo transducer----------' -./run-nemo-transducer.sh -rm -rf sherpa-onnx-* - -echo '----------NeMo CTC----------' -./run-nemo-ctc.sh -rm -rf sherpa-onnx-* - -echo '----------TeleSpeech CTC----------' -./run-telespeech-ctc.sh -rm -rf sherpa-onnx-* - -echo '----------paraformer----------' -./run-paraformer.sh -rm -rf sherpa-onnx-* - -echo '----------whisper----------' -./run-whisper.sh -rm -rf sherpa-onnx-* - -echo '----------zipformer transducer----------' -./run-zipformer-transducer.sh -rm -rf sherpa-onnx-* - -popd # non-streaming-asr - pushd vad ./run.sh rm *.onnx diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index 6ae126037..395c67c83 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -3,6 +3,7 @@ cd dotnet-examples/ cd ./offline-decode-files +./run-paraformer-itn.sh ./run-telespeech-ctc.sh ./run-nemo-ctc.sh ./run-paraformer.sh diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh index 7057ee2b4..a6c4345a8 100755 --- a/.github/scripts/test-nodejs-addon-npm.sh +++ b/.github/scripts/test-nodejs-addon-npm.sh @@ -119,6 +119,12 @@ tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 node ./test_asr_non_streaming_paraformer.js + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav + +node ./test_asr_non_streaming_paraformer_itn.js + rm -rf sherpa-onnx-paraformer-zh-2023-03-28 echo "----------tts----------" diff --git 
a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index a27214383..2098bb166 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -11,6 +11,15 @@ ls -lh node_modules # offline asr +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +ls -lh +tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst +node ./test-offline-paraformer-itn.js +rm -rf sherpa-onnx-paraformer-zh-2023-03-28 + curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2 ls -lh tar xvf sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2 diff --git a/.github/workflows/run-java-test.yaml b/.github/workflows/run-java-test.yaml index c000b277b..30fc6a827 100644 --- a/.github/workflows/run-java-test.yaml +++ b/.github/workflows/run-java-test.yaml @@ -190,6 +190,8 @@ jobs: shell: bash run: | cd ./java-api-examples + ./run-inverse-text-normalization-paraformer.sh + ./run-non-streaming-decode-file-paraformer.sh rm -rf sherpa-onnx-paraformer-zh-* diff --git a/.github/workflows/test-dot-net.yaml b/.github/workflows/test-dot-net.yaml index 7052542db..55bc8e6a3 100644 --- a/.github/workflows/test-dot-net.yaml +++ b/.github/workflows/test-dot-net.yaml @@ -39,7 +39,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ubuntu-latest] python-version: ["3.8"] steps: @@ -72,45 +72,18 @@ jobs: cmake --build . 
--target install --config Release - - name: Build sherpa-onnx for windows x86 - if: matrix.os == 'windows-latest' - shell: bash - run: | - export CMAKE_CXX_COMPILER_LAUNCHER=ccache - export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" - cmake --version - - mkdir build-win32 - cd build-win32 - cmake \ - -A Win32 \ - -DBUILD_SHARED_LIBS=ON \ - -DCMAKE_INSTALL_PREFIX=./install \ - -DCMAKE_BUILD_TYPE=Release \ - -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ - -DBUILD_ESPEAK_NG_EXE=OFF \ - -DSHERPA_ONNX_ENABLE_BINARY=ON \ - .. - cmake --build . --target install --config Release - - uses: actions/upload-artifact@v4 with: name: ${{ matrix.os }} path: ./build/install/lib/ - - uses: actions/upload-artifact@v4 - if: matrix.os == 'windows-latest' - with: - name: ${{ matrix.os }}-win32 - path: ./build-win32/install/lib/ - test-dot-net: runs-on: ${{ matrix.os }} needs: [build-libs] strategy: fail-fast: false matrix: - os: [ubuntu-latest, macos-latest] #, windows-latest] + os: [ubuntu-latest] python-version: ["3.8"] steps: @@ -134,30 +107,11 @@ jobs: name: ubuntu-latest path: /tmp/linux - - name: Retrieve artifact from macos-latest - uses: actions/download-artifact@v4 - with: - name: macos-latest - path: /tmp/macos - - - name: Retrieve artifact from windows-latest - uses: actions/download-artifact@v4 - with: - name: windows-latest - path: /tmp/windows-x64 - - - name: Retrieve artifact from windows-latest - uses: actions/download-artifact@v4 - with: - name: windows-latest-win32 - path: /tmp/windows-x86 - - name: Setup .NET uses: actions/setup-dotnet@v4 with: dotnet-version: | 6.0.x - 7.0.x - name: Check dotnet run: dotnet --info @@ -171,15 +125,6 @@ jobs: echo "----------/tmp/linux----------" ls -lh /tmp/linux - echo "----------/tmp/macos----------" - ls -lh /tmp/macos - - echo "----------/tmp/windows-x64----------" - ls -lh /tmp/windows-x64 - - echo "----------/tmp/windows-x86----------" - ls -lh /tmp/windows-x86 - - name: Build shell: bash run: | diff --git 
a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index 5724d9cb9..1ec654418 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -127,7 +127,7 @@ jobs: - uses: actions/upload-artifact@v4 with: - name: tts-waves + name: tts-waves-${{ matrix.os }} path: tts-waves - name: Test non-streaming decoding files (macOS) @@ -154,6 +154,7 @@ jobs: echo "Test paraformer" ./run-paraformer.sh + ./run-paraformer-itn.sh rm -rf sherpa-onnx-paraformer-zh-2023-03-28 echo "Test NeMo CTC" diff --git a/.github/workflows/test-nodejs-addon-api.yaml b/.github/workflows/test-nodejs-addon-api.yaml index 716a9fb73..224fc0f0b 100644 --- a/.github/workflows/test-nodejs-addon-api.yaml +++ b/.github/workflows/test-nodejs-addon-api.yaml @@ -39,8 +39,8 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-11, macos-14, ubuntu-20.04, ubuntu-22.04] #, windows-latest] - node-version: ["16", "17", "18", "19", "21", "22"] + os: [macos-latest, ubuntu-latest, ubuntu-latest] + node-version: ["16", "22"] python-version: ["3.8"] steps: diff --git a/.gitignore b/.gitignore index 1eb26e5c2..2176798be 100644 --- a/.gitignore +++ b/.gitignore @@ -107,3 +107,4 @@ package-lock.json sherpa-onnx-nemo-* sherpa-onnx-vits-* sherpa-onnx-telespeech-ctc-* +*.fst diff --git a/dart-api-examples/non-streaming-asr/bin/paraformer-itn.dart b/dart-api-examples/non-streaming-asr/bin/paraformer-itn.dart new file mode 100644 index 000000000..c8d2c0801 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/bin/paraformer-itn.dart @@ -0,0 +1,63 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('model', help: 'Path to the paraformer model') + ..addOption('tokens', help: 'Path to tokens.txt') + 
..addOption('rule-fsts', + help: 'Path to rule fsts for inverse text normalization') + ..addOption('input-wav', help: 'Path to input.wav to transcribe'); + + final res = parser.parse(arguments); + if (res['model'] == null || + res['tokens'] == null || + res['rule-fsts'] == null || + res['input-wav'] == null) { + print(parser.usage); + exit(1); + } + + final model = res['model'] as String; + final tokens = res['tokens'] as String; + final ruleFsts = res['rule-fsts'] as String; + final inputWav = res['input-wav'] as String; + + final paraformer = sherpa_onnx.OfflineParaformerModelConfig( + model: model, + ); + + final modelConfig = sherpa_onnx.OfflineModelConfig( + paraformer: paraformer, + tokens: tokens, + debug: true, + numThreads: 1, + modelType: 'paraformer', + ); + final config = sherpa_onnx.OfflineRecognizerConfig( + model: modelConfig, + ruleFsts: ruleFsts, + ); + final recognizer = sherpa_onnx.OfflineRecognizer(config); + + final waveData = sherpa_onnx.readWave(inputWav); + final stream = recognizer.createStream(); + + stream.acceptWaveform( + samples: waveData.samples, sampleRate: waveData.sampleRate); + recognizer.decode(stream); + + final result = recognizer.getResult(stream); + print(result.text); + + stream.free(); + recognizer.free(); +} diff --git a/dart-api-examples/non-streaming-asr/run-paraformer-itn.sh b/dart-api-examples/non-streaming-asr/run-paraformer-itn.sh new file mode 100755 index 000000000..7d8e30859 --- /dev/null +++ b/dart-api-examples/non-streaming-asr/run-paraformer-itn.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + + tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +fi + +if [ ! 
-f ./itn-zh-number.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +fi + +if [ ! -f ./itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst +fi + +dart run \ + ./bin/paraformer-itn.dart \ + --model ./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx \ + --tokens ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --rule-fsts ./itn_zh_number.fst \ + --input-wav ./itn-zh-number.wav diff --git a/dotnet-examples/offline-decode-files/Program.cs b/dotnet-examples/offline-decode-files/Program.cs index ea30a14e2..301774f8e 100644 --- a/dotnet-examples/offline-decode-files/Program.cs +++ b/dotnet-examples/offline-decode-files/Program.cs @@ -69,6 +69,10 @@ class Options HelpText = "Valid decoding methods are: greedy_search, modified_beam_search")] public string DecodingMethod { get; set; } + [Option("rule-fsts", Required = false, Default = "", + HelpText = "If not empty, path to rule fst for inverse text normalization")] + public string RuleFsts { get; set; } + [Option("max-active-paths", Required = false, Default = 4, HelpText = @"Used only when --decoding--method is modified_beam_search. It specifies number of active paths to keep during the search")] @@ -233,6 +237,7 @@ private static void Run(Options options) config.MaxActivePaths = options.MaxActivePaths; config.HotwordsFile = options.HotwordsFile; config.HotwordsScore = options.HotwordsScore; + config.RuleFsts = options.RuleFsts; config.ModelConfig.Debug = 0; diff --git a/dotnet-examples/offline-decode-files/run-paraformer-itn.sh b/dotnet-examples/offline-decode-files/run-paraformer-itn.sh new file mode 100755 index 000000000..3f30d98f0 --- /dev/null +++ b/dotnet-examples/offline-decode-files/run-paraformer-itn.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! 
-d ./sherpa-onnx-paraformer-zh-2023-03-28 ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +fi + +if [ ! -f ./itn-zh-number.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +fi + +if [ ! -f ./itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst +fi + +dotnet run \ + --tokens=./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --paraformer=./sherpa-onnx-paraformer-zh-2023-03-28/model.onnx \ + --rule-fsts=./itn_zh_number.fst \ + --num-threads=2 \ + --files ./itn-zh-number.wav diff --git a/dotnet-examples/offline-decode-files/run-telespeech-ctc.sh b/dotnet-examples/offline-decode-files/run-telespeech-ctc.sh index d678026d0..a7aae402c 100755 --- a/dotnet-examples/offline-decode-files/run-telespeech-ctc.sh +++ b/dotnet-examples/offline-decode-files/run-telespeech-ctc.sh @@ -11,5 +11,5 @@ fi dotnet run \ --telespeech-ctc=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx \ --tokens=./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt \ - --model-type=telespeech-ctc \ + --model-type=telespeech_ctc \ --files ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav diff --git a/go-api-examples/non-streaming-decode-files/main.go b/go-api-examples/non-streaming-decode-files/main.go index 48ad35302..b1d2063b9 100644 --- a/go-api-examples/non-streaming-decode-files/main.go +++ b/go-api-examples/non-streaming-decode-files/main.go @@ -48,6 +48,8 @@ func main() { flag.StringVar(&config.DecodingMethod, "decoding-method", "greedy_search", "Decoding method. 
Possible values: greedy_search, modified_beam_search") flag.IntVar(&config.MaxActivePaths, "max-active-paths", 4, "Used only when --decoding-method is modified_beam_search") + flag.StringVar(&config.RuleFsts, "rule-fsts", "", "If not empty, path to rule fst for inverse text normalization") + flag.StringVar(&config.RuleFars, "rule-fars", "", "If not empty, path to rule fst archives for inverse text normalization") flag.Parse() diff --git a/go-api-examples/non-streaming-decode-files/run-paraformer-itn.sh b/go-api-examples/non-streaming-decode-files/run-paraformer-itn.sh new file mode 100755 index 000000000..93103d071 --- /dev/null +++ b/go-api-examples/non-streaming-decode-files/run-paraformer-itn.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -d sherpa-onnx-paraformer-zh-2023-03-28 ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +fi + +if [ ! -f ./itn-zh-number.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +fi + +if [ ! 
-f ./itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst +fi + +go mod tidy +go build + +./non-streaming-decode-files \ + --paraformer ./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx \ + --tokens ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt \ + --model-type paraformer \ + --rule-fsts ./itn_zh_number.fst \ + --debug 0 \ + ./itn-zh-number.wav diff --git a/go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh b/go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh index d9785b2aa..cddf82865 100755 --- a/go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh +++ b/go-api-examples/non-streaming-decode-files/run-telespeech-ctc.sh @@ -14,6 +14,6 @@ go build ./non-streaming-decode-files \ --telespeech-ctc ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx \ --tokens ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt \ - --model-type telespeech-ctc \ + --model-type telespeech_ctc \ --debug 0 \ ./sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/test_wavs/3-sichuan.wav diff --git a/java-api-examples/InverseTextNormalizationNonStreamingParaformer.java b/java-api-examples/InverseTextNormalizationNonStreamingParaformer.java new file mode 100644 index 000000000..53d18248b --- /dev/null +++ b/java-api-examples/InverseTextNormalizationNonStreamingParaformer.java @@ -0,0 +1,54 @@ +// Copyright 2024 Xiaomi Corporation + +// This file shows how to use an offline paraformer, i.e., non-streaming paraformer, +// to decode files with inverse text normalization. 
+import com.k2fsa.sherpa.onnx.*; + +public class InverseTextNormalizationNonStreamingParaformer { + public static void main(String[] args) { + // please refer to + // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese-english + // to download model files + String model = "./sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx"; + String tokens = "./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt"; + + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav + String waveFilename = "./itn-zh-number.wav"; + + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + String ruleFsts = "./itn_zh_number.fst"; + + WaveReader reader = new WaveReader(waveFilename); + + OfflineParaformerModelConfig paraformer = + OfflineParaformerModelConfig.builder().setModel(model).build(); + + OfflineModelConfig modelConfig = + OfflineModelConfig.builder() + .setParaformer(paraformer) + .setTokens(tokens) + .setNumThreads(1) + .setDebug(true) + .build(); + + OfflineRecognizerConfig config = + OfflineRecognizerConfig.builder() + .setOfflineModelConfig(modelConfig) + .setDecodingMethod("greedy_search") + .setRuleFsts(ruleFsts) + .build(); + + OfflineRecognizer recognizer = new OfflineRecognizer(config); + OfflineStream stream = recognizer.createStream(); + stream.acceptWaveform(reader.getSamples(), reader.getSampleRate()); + + recognizer.decode(stream); + + String text = recognizer.getResult(stream).getText(); + + System.out.printf("filename:%s\nresult:%s\n", waveFilename, text); + + stream.release(); + recognizer.release(); + } +} diff --git a/java-api-examples/run-inverse-text-normalization-paraformer.sh b/java-api-examples/run-inverse-text-normalization-paraformer.sh new file mode 100755 index 000000000..606dba6f7 --- /dev/null +++ b/java-api-examples/run-inverse-text-normalization-paraformer.sh @@ -0,0 +1,46 @@ 
+#!/usr/bin/env bash + +set -ex + +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then + mkdir -p ../build + pushd ../build + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=ON \ + .. + + make -j4 + ls -lh lib + popd +fi + +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then + pushd ../sherpa-onnx/java-api + make + popd +fi + +if [ ! -f ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + + tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +fi + +if [ ! -f ./itn-zh-number.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +fi + +if [ ! -f ./itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst +fi + +java \ + -Djava.library.path=$PWD/../build/lib \ + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ + InverseTextNormalizationNonStreamingParaformer.java diff --git a/kotlin-api-examples/run.sh b/kotlin-api-examples/run.sh index e7946598d..a96c09748 100755 --- a/kotlin-api-examples/run.sh +++ b/kotlin-api-examples/run.sh @@ -203,6 +203,34 @@ function testOfflineAsr() { java -Djava.library.path=../build/lib -jar $out_filename } +function testInverseTextNormalizationAsr() { + if [ ! -f ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + fi + + if [ ! 
-f ./itn-zh-number.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav + fi + + if [ ! -f ./itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi + + out_filename=test_offline_asr.jar + kotlinc-jvm -include-runtime -d $out_filename \ + test_itn_asr.kt \ + FeatureConfig.kt \ + OfflineRecognizer.kt \ + OfflineStream.kt \ + WaveReader.kt \ + faked-asset-manager.kt + + ls -lh $out_filename + java -Djava.library.path=../build/lib -jar $out_filename +} + function testPunctuation() { if [ ! -f ./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx ]; then curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/punctuation-models/sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2 @@ -229,3 +257,4 @@ testAudioTagging testSpokenLanguageIdentification testOfflineAsr testPunctuation +testInverseTextNormalizationAsr diff --git a/kotlin-api-examples/test_itn_asr.kt b/kotlin-api-examples/test_itn_asr.kt new file mode 100644 index 000000000..250af4172 --- /dev/null +++ b/kotlin-api-examples/test_itn_asr.kt @@ -0,0 +1,37 @@ +package com.k2fsa.sherpa.onnx + +fun main() { + test() +} + +fun test() { + val recognizer = createOfflineRecognizer() + val waveFilename = "./itn-zh-number.wav"; + + val objArray = WaveReader.readWaveFromFile( + filename = waveFilename, + ) + val samples: FloatArray = objArray[0] as FloatArray + val sampleRate: Int = objArray[1] as Int + + val stream = recognizer.createStream() + stream.acceptWaveform(samples, sampleRate=sampleRate) + recognizer.decode(stream) + + val result = recognizer.getResult(stream) + println(result) + + stream.release() + recognizer.release() +} + +fun createOfflineRecognizer(): OfflineRecognizer { + val config = OfflineRecognizerConfig( + featConfig = getFeatureConfig(sampleRate = 16000, featureDim = 80), + modelConfig = getOfflineModelConfig(0)!!, + 
ruleFsts = "./itn_zh_number.fst", + ) + + return OfflineRecognizer(config = config) +} + diff --git a/nodejs-addon-examples/test_asr_non_streaming_paraformer_itn.js b/nodejs-addon-examples/test_asr_non_streaming_paraformer_itn.js new file mode 100644 index 000000000..c5d0f34b0 --- /dev/null +++ b/nodejs-addon-examples/test_asr_non_streaming_paraformer_itn.js @@ -0,0 +1,48 @@ +// Copyright (c) 2024 Xiaomi Corporation +const sherpa_onnx = require('sherpa-onnx-node'); + +// Please download test files from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +const config = { + 'featConfig': { + 'sampleRate': 16000, + 'featureDim': 80, + }, + 'modelConfig': { + 'paraformer': { + 'model': './sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx', + }, + 'tokens': './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt', + 'numThreads': 2, + 'provider': 'cpu', + 'debug': 1, + }, + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + ruleFsts: './itn_zh_number.fst', +}; + +// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +const waveFilename = './itn-zh-number.wav'; + +const recognizer = new sherpa_onnx.OfflineRecognizer(config); +console.log('Started') +let start = Date.now(); +const stream = recognizer.createStream(); +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples}); + +recognizer.decode(stream); +result = recognizer.getResult(stream) +let stop = Date.now(); +console.log('Done') + +const elapsed_seconds = (stop - start) / 1000; +const duration = wave.samples.length / wave.sampleRate; +const real_time_factor = elapsed_seconds / duration; +console.log('Wave duration', duration.toFixed(3), 'seconds') +console.log('Elapsed', elapsed_seconds.toFixed(3), 'seconds') +console.log( + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, + real_time_factor.toFixed(3)) +console.log(waveFilename) 
+console.log('result\n', result) diff --git a/nodejs-examples/test-offline-paraformer-itn.js b/nodejs-examples/test-offline-paraformer-itn.js new file mode 100644 index 000000000..ebadc5b0e --- /dev/null +++ b/nodejs-examples/test-offline-paraformer-itn.js @@ -0,0 +1,128 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) + +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createOfflineRecognizer() { + let featConfig = { + sampleRate: 16000, + featureDim: 80, + }; + + let modelConfig = { + transducer: { + encoder: '', + decoder: '', + joiner: '', + }, + paraformer: { + model: './sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx', + }, + nemoCtc: { + model: '', + }, + whisper: { + encoder: '', + decoder: '', + language: '', + task: '', + tailPaddings: -1, + }, + tdnn: { + model: '', + }, + tokens: './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt', + numThreads: 1, + debug: 0, + provider: 'cpu', + modelType: 'paraformer', + }; + + let lmConfig = { + model: '', + scale: 1.0, + }; + + let config = { + featConfig: featConfig, + modelConfig: modelConfig, + lmConfig: lmConfig, + decodingMethod: 'greedy_search', + maxActivePaths: 4, + hotwordsFile: '', + hotwordsScore: 1.5, + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + ruleFsts: './itn_zh_number.fst', + }; + + return sherpa_onnx.createOfflineRecognizer(config); +} + + +const recognizer = createOfflineRecognizer(); +const stream = recognizer.createStream(); + +// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +const waveFilename = './itn-zh-number.wav'; + +const reader = new wav.Reader(); +const readable = new Readable().wrap(reader); +const buf = []; + +reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { + if (sampleRate != recognizer.config.featConfig.sampleRate) { + throw new Error(`Only 
support sampleRate ${ + recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); + } + + if (audioFormat != 1) { + throw new Error(`Only support PCM format. Given ${audioFormat}`); + } + + if (channels != 1) { + throw new Error(`Only a single channel. Given ${channels}`); + } + + if (bitDepth != 16) { + throw new Error(`Only support 16-bit samples. Given ${bitDepth}`); + } +}); + +fs.createReadStream(waveFilename, {'highWaterMark': 4096}) + .pipe(reader) + .on('finish', function(err) { + // tail padding + const floatSamples = + new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); + + buf.push(floatSamples); + const flattened = + Float32Array.from(buf.reduce((a, b) => [...a, ...b], [])); + + stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); + recognizer.decode(stream); + const text = recognizer.getResult(stream).text; + console.log(text); + + stream.free(); + recognizer.free(); + }); + +readable.on('readable', function() { + let chunk; + while ((chunk = readable.read()) != null) { + const int16Samples = new Int16Array( + chunk.buffer, chunk.byteOffset, + chunk.length / Int16Array.BYTES_PER_ELEMENT); + + const floatSamples = new Float32Array(int16Samples.length); + for (let i = 0; i < floatSamples.length; i++) { + floatSamples[i] = int16Samples[i] / 32768.0; + } + + buf.push(floatSamples); + } +}); diff --git a/scripts/dotnet/OfflineRecognizerConfig.cs b/scripts/dotnet/OfflineRecognizerConfig.cs index 23d6e18a0..371a556ea 100644 --- a/scripts/dotnet/OfflineRecognizerConfig.cs +++ b/scripts/dotnet/OfflineRecognizerConfig.cs @@ -21,7 +21,8 @@ public OfflineRecognizerConfig() MaxActivePaths = 4; HotwordsFile = ""; HotwordsScore = 1.5F; - + RuleFsts = ""; + RuleFars = ""; } public FeatureConfig FeatConfig; public OfflineModelConfig ModelConfig; @@ -36,5 +37,11 @@ public OfflineRecognizerConfig() public string HotwordsFile; public float HotwordsScore; + + [MarshalAs(UnmanagedType.LPStr)] + public string RuleFsts; + 
[MarshalAs(UnmanagedType.LPStr)] + public string RuleFars; } } diff --git a/scripts/go/_internal/non-streaming-decode-files/run-paraformer-itn.sh b/scripts/go/_internal/non-streaming-decode-files/run-paraformer-itn.sh new file mode 120000 index 000000000..58c03fd85 --- /dev/null +++ b/scripts/go/_internal/non-streaming-decode-files/run-paraformer-itn.sh @@ -0,0 +1 @@ +../../../../go-api-examples/non-streaming-decode-files/run-paraformer-itn.sh \ No newline at end of file diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index a03031866..437f4f328 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -397,6 +397,10 @@ type OfflineRecognizerConfig struct { // Used only when DecodingMethod is modified_beam_search. MaxActivePaths int + HotwordsFile string + HotwordsScore float32 + RuleFsts string + RuleFars string } // It wraps a pointer from C @@ -491,6 +495,17 @@ func NewOfflineRecognizer(config *OfflineRecognizerConfig) *OfflineRecognizer { c.max_active_paths = C.int(config.MaxActivePaths) + c.hotwords_file = C.CString(config.HotwordsFile) + defer C.free(unsafe.Pointer(c.hotwords_file)) + + c.hotwords_score = C.float(config.HotwordsScore) + + c.rule_fsts = C.CString(config.RuleFsts) + defer C.free(unsafe.Pointer(c.rule_fsts)) + + c.rule_fars = C.CString(config.RuleFars) + defer C.free(unsafe.Pointer(c.rule_fars)) + recognizer := &OfflineRecognizer{} recognizer.impl = C.CreateOfflineRecognizer(&c) diff --git a/scripts/node-addon-api/README.md b/scripts/node-addon-api/README.md index f9b3cf0bf..fb291b77b 100644 --- a/scripts/node-addon-api/README.md +++ b/scripts/node-addon-api/README.md @@ -15,8 +15,8 @@ cmake -DCMAKE_INSTALL_PREFIX=./install -DBUILD_SHARED_LIBS=ON .. 
make -j install export PKG_CONFIG_PATH=$PWD/install:$PKG_CONFIG_PATH cd ../scripts/node-addon-api/ - -./node_modules/.bin/node-gyp build --verbose +npm i +./node_modules/.bin/cmake-js compile --log-level verbose # see test/test_asr_streaming_transducer.js # for usages diff --git a/scripts/node-addon-api/src/non-streaming-asr.cc b/scripts/node-addon-api/src/non-streaming-asr.cc index 671528200..db14ef52d 100644 --- a/scripts/node-addon-api/src/non-streaming-asr.cc +++ b/scripts/node-addon-api/src/non-streaming-asr.cc @@ -180,6 +180,8 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { SHERPA_ONNX_ASSIGN_ATTR_INT32(max_active_paths, maxActivePaths); SHERPA_ONNX_ASSIGN_ATTR_STR(hotwords_file, hotwordsFile); SHERPA_ONNX_ASSIGN_ATTR_FLOAT(hotwords_score, hotwordsScore); + SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts); + SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars); SherpaOnnxOfflineRecognizer *recognizer = CreateOfflineRecognizer(&c); @@ -259,6 +261,14 @@ CreateOfflineRecognizerWrapper(const Napi::CallbackInfo &info) { delete[] c.hotwords_file; } + if (c.rule_fsts) { + delete[] c.rule_fsts; + } + + if (c.rule_fars) { + delete[] c.rule_fars; + } + if (!recognizer) { Napi::TypeError::New(env, "Please check your config!") .ThrowAsJavaScriptException(); diff --git a/scripts/node-addon-api/src/non-streaming-tts.cc b/scripts/node-addon-api/src/non-streaming-tts.cc index c230b972a..70d97cddb 100644 --- a/scripts/node-addon-api/src/non-streaming-tts.cc +++ b/scripts/node-addon-api/src/non-streaming-tts.cc @@ -44,7 +44,7 @@ static SherpaOnnxOfflineTtsModelConfig GetOfflineTtsModelConfig( c.vits = GetOfflineTtsVitsModelConfig(o); - SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, num_threads); + SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); if (o.Has("debug") && (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) { diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index eb4e293d1..01e2191c3 100644 --- 
a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -388,6 +388,9 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer( recognizer_config.hotwords_score = SHERPA_ONNX_OR(config->hotwords_score, 1.5); + recognizer_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); + recognizer_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, ""); + if (config->model_config.debug) { SHERPA_ONNX_LOGE("%s", recognizer_config.ToString().c_str()); } diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index e75d1955f..0229f8059 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -411,6 +411,8 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig { /// Bonus score for each token in hotwords. float hotwords_score; + const char *rule_fsts; + const char *rule_fars; } SherpaOnnxOfflineRecognizerConfig; SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizer diff --git a/sherpa-onnx/flutter/lib/src/offline_recognizer.dart b/sherpa-onnx/flutter/lib/src/offline_recognizer.dart index 7a220105c..b5619e3e0 100644 --- a/sherpa-onnx/flutter/lib/src/offline_recognizer.dart +++ b/sherpa-onnx/flutter/lib/src/offline_recognizer.dart @@ -137,11 +137,13 @@ class OfflineRecognizerConfig { this.maxActivePaths = 4, this.hotwordsFile = '', this.hotwordsScore = 1.5, + this.ruleFsts = '', + this.ruleFars = '', }); @override String toString() { - return 'OfflineRecognizerConfig(feat: $feat, model: $model, lm: $lm, decodingMethod: $decodingMethod, maxActivePaths: $maxActivePaths, hotwordsFile: $hotwordsFile, hotwordsScore: $hotwordsScore)'; + return 'OfflineRecognizerConfig(feat: $feat, model: $model, lm: $lm, decodingMethod: $decodingMethod, maxActivePaths: $maxActivePaths, hotwordsFile: $hotwordsFile, hotwordsScore: $hotwordsScore, ruleFsts: $ruleFsts, ruleFars: $ruleFars)'; } final FeatureConfig feat; @@ -154,6 +156,9 @@ class OfflineRecognizerConfig { final String hotwordsFile; final double hotwordsScore; + + final String ruleFsts; 
+ final String ruleFars; } class OfflineRecognizerResult { @@ -232,8 +237,13 @@ class OfflineRecognizer { c.ref.hotwordsFile = config.hotwordsFile.toNativeUtf8(); c.ref.hotwordsScore = config.hotwordsScore; + c.ref.ruleFsts = config.ruleFsts.toNativeUtf8(); + c.ref.ruleFars = config.ruleFars.toNativeUtf8(); + final ptr = SherpaOnnxBindings.createOfflineRecognizer?.call(c) ?? nullptr; + calloc.free(c.ref.ruleFars); + calloc.free(c.ref.ruleFsts); calloc.free(c.ref.hotwordsFile); calloc.free(c.ref.decodingMethod); calloc.free(c.ref.lm.model); diff --git a/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart b/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart index bce1be589..70d9572e7 100644 --- a/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart +++ b/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart @@ -130,6 +130,9 @@ final class SherpaOnnxOfflineRecognizerConfig extends Struct { @Float() external double hotwordsScore; + + external Pointer ruleFsts; + external Pointer ruleFars; } final class SherpaOnnxOnlineTransducerModelConfig extends Struct { diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineRecognizerConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineRecognizerConfig.java index 94d3debc9..a8222ad77 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineRecognizerConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineRecognizerConfig.java @@ -9,6 +9,8 @@ public class OfflineRecognizerConfig { private final int maxActivePaths; private final String hotwordsFile; private final float hotwordsScore; + private final String ruleFsts; + private final String ruleFars; private OfflineRecognizerConfig(Builder builder) { this.featConfig = builder.featConfig; @@ -17,6 +19,8 @@ private OfflineRecognizerConfig(Builder builder) { this.maxActivePaths = builder.maxActivePaths; this.hotwordsFile = builder.hotwordsFile; this.hotwordsScore = builder.hotwordsScore; + this.ruleFsts = builder.ruleFsts; + 
this.ruleFars = builder.ruleFars; } public static Builder builder() { @@ -34,6 +38,8 @@ public static class Builder { private int maxActivePaths = 4; private String hotwordsFile = ""; private float hotwordsScore = 1.5f; + private String ruleFsts = ""; + private String ruleFars = ""; public OfflineRecognizerConfig build() { return new OfflineRecognizerConfig(this); @@ -68,5 +74,15 @@ public Builder setHotwordsScore(float hotwordsScore) { this.hotwordsScore = hotwordsScore; return this; } + + public Builder setRuleFsts(String ruleFsts) { + this.ruleFsts = ruleFsts; + return this; + } + + public Builder setRuleFars(String ruleFars) { + this.ruleFars = ruleFars; + return this; + } } } diff --git a/sherpa-onnx/jni/offline-recognizer.cc b/sherpa-onnx/jni/offline-recognizer.cc index cf69389a3..070d46f08 100644 --- a/sherpa-onnx/jni/offline-recognizer.cc +++ b/sherpa-onnx/jni/offline-recognizer.cc @@ -34,6 +34,18 @@ static OfflineRecognizerConfig GetOfflineConfig(JNIEnv *env, jobject config) { fid = env->GetFieldID(cls, "hotwordsScore", "F"); ans.hotwords_score = env->GetFloatField(config, fid); + fid = env->GetFieldID(cls, "ruleFsts", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(config, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.rule_fsts = p; + env->ReleaseStringUTFChars(s, p); + + fid = env->GetFieldID(cls, "ruleFars", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(config, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.rule_fars = p; + env->ReleaseStringUTFChars(s, p); + //---------- feat config ---------- fid = env->GetFieldID(cls, "featConfig", "Lcom/k2fsa/sherpa/onnx/FeatureConfig;"); diff --git a/sherpa-onnx/kotlin-api/OfflineRecognizer.kt b/sherpa-onnx/kotlin-api/OfflineRecognizer.kt index 151ac73d5..c910e8d68 100644 --- a/sherpa-onnx/kotlin-api/OfflineRecognizer.kt +++ b/sherpa-onnx/kotlin-api/OfflineRecognizer.kt @@ -53,6 +53,8 @@ data class OfflineRecognizerConfig( var maxActivePaths: Int = 4, var hotwordsFile: String = 
"", var hotwordsScore: Float = 1.5f, + var ruleFsts: String = "", + var ruleFars: String = "", ) class OfflineRecognizer( diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index 7346ac4b8..24082a827 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -387,7 +387,9 @@ func sherpaOnnxOfflineRecognizerConfig( decodingMethod: String = "greedy_search", maxActivePaths: Int = 4, hotwordsFile: String = "", - hotwordsScore: Float = 1.5 + hotwordsScore: Float = 1.5, + ruleFsts: String = "", + ruleFars: String = "" ) -> SherpaOnnxOfflineRecognizerConfig { return SherpaOnnxOfflineRecognizerConfig( feat_config: featConfig, @@ -396,7 +398,9 @@ func sherpaOnnxOfflineRecognizerConfig( decoding_method: toCPointer(decodingMethod), max_active_paths: Int32(maxActivePaths), hotwords_file: toCPointer(hotwordsFile), - hotwords_score: hotwordsScore + hotwords_score: hotwordsScore, + rule_fsts: toCPointer(ruleFsts), + rule_fars: toCPointer(ruleFars) ) } diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index 53afe1875..2179fd87d 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ -628,7 +628,7 @@ function initSherpaOnnxOfflineRecognizerConfig(config, Module) { const model = initSherpaOnnxOfflineModelConfig(config.modelConfig, Module); const lm = initSherpaOnnxOfflineLMConfig(config.lmConfig, Module); - const len = feat.len + model.len + lm.len + 4 * 4; + const len = feat.len + model.len + lm.len + 6 * 4; const ptr = Module._malloc(len); let offset = 0; @@ -643,7 +643,10 @@ function initSherpaOnnxOfflineRecognizerConfig(config, Module) { const decodingMethodLen = Module.lengthBytesUTF8(config.decodingMethod) + 1; const hotwordsFileLen = Module.lengthBytesUTF8(config.hotwordsFile) + 1; - const bufferLen = decodingMethodLen + hotwordsFileLen; + const ruleFstsLen = Module.lengthBytesUTF8(config.ruleFsts || '') + 1; + const ruleFarsLen = 
Module.lengthBytesUTF8(config.ruleFars || '') + 1; + const bufferLen = + decodingMethodLen + hotwordsFileLen + ruleFstsLen + ruleFarsLen; const buffer = Module._malloc(bufferLen); offset = 0; @@ -651,6 +654,13 @@ function initSherpaOnnxOfflineRecognizerConfig(config, Module) { offset += decodingMethodLen; Module.stringToUTF8(config.hotwordsFile, buffer + offset, hotwordsFileLen); + offset += hotwordsFileLen; + + Module.stringToUTF8(config.ruleFsts || '', buffer + offset, ruleFstsLen); + offset += ruleFstsLen; + + Module.stringToUTF8(config.ruleFars || '', buffer + offset, ruleFarsLen); + offset += ruleFarsLen; offset = feat.len + model.len + lm.len; @@ -666,6 +676,15 @@ function initSherpaOnnxOfflineRecognizerConfig(config, Module) { Module.setValue(ptr + offset, config.hotwordsScore, 'float'); offset += 4; + Module.setValue( + ptr + offset, buffer + decodingMethodLen + hotwordsFileLen, 'i8*'); + offset += 4; + + Module.setValue( + ptr + offset, buffer + decodingMethodLen + hotwordsFileLen + ruleFstsLen, + 'i8*'); + offset += 4; + return { buffer: buffer, ptr: ptr, len: len, feat: feat, model: model, lm: lm } diff --git a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc index 177fb6f04..6e138c76f 100644 --- a/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc +++ b/wasm/nodejs/sherpa-onnx-wasm-nodejs.cc @@ -29,7 +29,7 @@ static_assert(sizeof(SherpaOnnxFeatureConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineRecognizerConfig) == sizeof(SherpaOnnxFeatureConfig) + sizeof(SherpaOnnxOfflineLMConfig) + - sizeof(SherpaOnnxOfflineModelConfig) + 4 * 4, + sizeof(SherpaOnnxOfflineModelConfig) + 6 * 4, ""); void PrintOfflineTtsConfig(SherpaOnnxOfflineTtsConfig *tts_config) { @@ -103,6 +103,8 @@ void PrintOfflineRecognizerConfig(SherpaOnnxOfflineRecognizerConfig *config) { fprintf(stdout, "max active paths: %d\n", config->max_active_paths); fprintf(stdout, "hotwords_file: %s\n", config->hotwords_file); fprintf(stdout, "hotwords_score: %.2f\n", 
config->hotwords_score); + fprintf(stdout, "rule_fsts: %s\n", config->rule_fsts); + fprintf(stdout, "rule_fars: %s\n", config->rule_fars); } void CopyHeap(const char *src, int32_t num_bytes, char *dst) { From 349d957da28f44121c22d25787d7ccd62c684c01 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 17 Jun 2024 18:39:23 +0800 Subject: [PATCH 027/237] Add inverse text normalization for online ASR (#1020) --- .github/scripts/test-python.sh | 11 ++ .../inverse-text-normalization-online-asr.py | 91 ++++++++++++++ sherpa-onnx/csrc/online-recognizer-ctc-impl.h | 12 +- sherpa-onnx/csrc/online-recognizer-impl.cc | 117 ++++++++++++++++++ sherpa-onnx/csrc/online-recognizer-impl.h | 20 +++ .../csrc/online-recognizer-paraformer-impl.h | 10 +- .../csrc/online-recognizer-transducer-impl.h | 12 +- .../online-recognizer-transducer-nemo-impl.h | 14 ++- sherpa-onnx/csrc/online-recognizer.cc | 43 ++++++- sherpa-onnx/csrc/online-recognizer.h | 13 +- sherpa-onnx/python/csrc/online-recognizer.cc | 29 +++-- .../python/sherpa_onnx/online_recognizer.py | 50 ++++++++ 12 files changed, 390 insertions(+), 32 deletions(-) create mode 100755 python-api-examples/inverse-text-normalization-online-asr.py diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh index c03b95426..0dbb8b99d 100755 --- a/.github/scripts/test-python.sh +++ b/.github/scripts/test-python.sh @@ -256,7 +256,18 @@ if [[ x$OS != x'windows-latest' ]]; then $repo/test_wavs/3.wav \ $repo/test_wavs/8k.wav + ln -s $repo $PWD/ + + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav + + python3 ./python-api-examples/inverse-text-normalization-online-asr.py + python3 sherpa-onnx/python/tests/test_online_recognizer.py --verbose + + rm -rfv sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 + + rm -rf $repo fi log "Test non-streaming transducer models" diff 
--git a/python-api-examples/inverse-text-normalization-online-asr.py b/python-api-examples/inverse-text-normalization-online-asr.py new file mode 100755 index 000000000..8524c20f3 --- /dev/null +++ b/python-api-examples/inverse-text-normalization-online-asr.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2024 Xiaomi Corporation + +""" +This script shows how to use inverse text normalization with streaming ASR. + +Usage: + +(1) Download the test model + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + +(2) Download rule fst + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + +Please refer to +https://github.com/k2-fsa/colab/blob/master/sherpa-onnx/itn_zh_number.ipynb +for how itn_zh_number.fst is generated. + +(3) Download test wave + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav + +(4) Run this script + +python3 ./python-api-examples/inverse-text-normalization-online-asr.py +""" +from pathlib import Path + +import sherpa_onnx +import soundfile as sf + + +def create_recognizer(): + encoder = "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx" + decoder = "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx" + joiner = "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx" + tokens = "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt" + rule_fsts = "./itn_zh_number.fst" + + if ( + not Path(encoder).is_file() + or not Path(decoder).is_file() + or not Path(joiner).is_file() + or not Path(tokens).is_file() + or not Path(rule_fsts).is_file() + ): + raise ValueError( + """Please download model files from + https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + """ + ) + return sherpa_onnx.OnlineRecognizer.from_transducer( + 
encoder=encoder, + decoder=decoder, + joiner=joiner, + tokens=tokens, + debug=True, + rule_fsts=rule_fsts, + ) + + +def main(): + recognizer = create_recognizer() + wave_filename = "./itn-zh-number.wav" + if not Path(wave_filename).is_file(): + raise ValueError( + """Please download model files from + https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models + """ + ) + audio, sample_rate = sf.read(wave_filename, dtype="float32", always_2d=True) + audio = audio[:, 0] # only use the first channel + + stream = recognizer.create_stream() + stream.accept_waveform(sample_rate, audio) + + tail_padding = [0] * int(0.3 * sample_rate) + stream.accept_waveform(sample_rate, tail_padding) + + while recognizer.is_ready(stream): + recognizer.decode_stream(stream) + + print(wave_filename) + print(recognizer.get_result_all(stream)) + + +if __name__ == "__main__": + main() diff --git a/sherpa-onnx/csrc/online-recognizer-ctc-impl.h b/sherpa-onnx/csrc/online-recognizer-ctc-impl.h index 4d8ce2961..7dd9d8b18 100644 --- a/sherpa-onnx/csrc/online-recognizer-ctc-impl.h +++ b/sherpa-onnx/csrc/online-recognizer-ctc-impl.h @@ -68,7 +68,8 @@ static OnlineRecognizerResult Convert(const OnlineCtcDecoderResult &src, class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl { public: explicit OnlineRecognizerCtcImpl(const OnlineRecognizerConfig &config) - : config_(config), + : OnlineRecognizerImpl(config), + config_(config), model_(OnlineCtcModel::Create(config.model_config)), sym_(config.model_config.tokens), endpoint_(config_.endpoint_config) { @@ -84,7 +85,8 @@ class OnlineRecognizerCtcImpl : public OnlineRecognizerImpl { #if __ANDROID_API__ >= 9 explicit OnlineRecognizerCtcImpl(AAssetManager *mgr, const OnlineRecognizerConfig &config) - : config_(config), + : OnlineRecognizerImpl(mgr, config), + config_(config), model_(OnlineCtcModel::Create(mgr, config.model_config)), sym_(mgr, config.model_config.tokens), endpoint_(config_.endpoint_config) { @@ -182,8 +184,10 @@ class 
OnlineRecognizerCtcImpl : public OnlineRecognizerImpl { // TODO(fangjun): Remember to change these constants if needed int32_t frame_shift_ms = 10; int32_t subsampling_factor = 4; - return Convert(decoder_result, sym_, frame_shift_ms, subsampling_factor, - s->GetCurrentSegment(), s->GetNumFramesSinceStart()); + auto r = Convert(decoder_result, sym_, frame_shift_ms, subsampling_factor, + s->GetCurrentSegment(), s->GetNumFramesSinceStart()); + r.text = ApplyInverseTextNormalization(r.text); + return r; } bool IsEndpoint(OnlineStream *s) const override { diff --git a/sherpa-onnx/csrc/online-recognizer-impl.cc b/sherpa-onnx/csrc/online-recognizer-impl.cc index 2de905772..89d172f97 100644 --- a/sherpa-onnx/csrc/online-recognizer-impl.cc +++ b/sherpa-onnx/csrc/online-recognizer-impl.cc @@ -4,11 +4,22 @@ #include "sherpa-onnx/csrc/online-recognizer-impl.h" +#if __ANDROID_API__ >= 9 +#include + +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#include "fst/extensions/far/far.h" +#include "kaldifst/csrc/kaldi-fst-io.h" +#include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/online-recognizer-ctc-impl.h" #include "sherpa-onnx/csrc/online-recognizer-paraformer-impl.h" #include "sherpa-onnx/csrc/online-recognizer-transducer-impl.h" #include "sherpa-onnx/csrc/online-recognizer-transducer-nemo-impl.h" #include "sherpa-onnx/csrc/onnx-utils.h" +#include "sherpa-onnx/csrc/text-utils.h" namespace sherpa_onnx { @@ -78,4 +89,110 @@ std::unique_ptr OnlineRecognizerImpl::Create( } #endif +OnlineRecognizerImpl::OnlineRecognizerImpl(const OnlineRecognizerConfig &config) + : config_(config) { + if (!config.rule_fsts.empty()) { + std::vector files; + SplitStringToVector(config.rule_fsts, ",", false, &files); + itn_list_.reserve(files.size()); + for (const auto &f : files) { + if (config.model_config.debug) { + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str()); + } + itn_list_.push_back(std::make_unique(f)); + } + } + + if (!config.rule_fars.empty()) 
{ + if (config.model_config.debug) { + SHERPA_ONNX_LOGE("Loading FST archives"); + } + std::vector files; + SplitStringToVector(config.rule_fars, ",", false, &files); + + itn_list_.reserve(files.size() + itn_list_.size()); + + for (const auto &f : files) { + if (config.model_config.debug) { + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); + } + std::unique_ptr> reader( + fst::FarReader::Open(f)); + for (; !reader->Done(); reader->Next()) { + std::unique_ptr r( + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); + + itn_list_.push_back( + std::make_unique(std::move(r))); + } + } + + if (config.model_config.debug) { + SHERPA_ONNX_LOGE("FST archives loaded!"); + } + } +} + +#if __ANDROID_API__ >= 9 +OnlineRecognizerImpl::OnlineRecognizerImpl(AAssetManager *mgr, + const OnlineRecognizerConfig &config) + : config_(config) { + if (!config.rule_fsts.empty()) { + std::vector files; + SplitStringToVector(config.rule_fsts, ",", false, &files); + itn_list_.reserve(files.size()); + for (const auto &f : files) { + if (config.model_config.debug) { + SHERPA_ONNX_LOGE("rule fst: %s", f.c_str()); + } + auto buf = ReadFile(mgr, f); + std::istrstream is(buf.data(), buf.size()); + itn_list_.push_back(std::make_unique(is)); + } + } + + if (!config.rule_fars.empty()) { + std::vector files; + SplitStringToVector(config.rule_fars, ",", false, &files); + itn_list_.reserve(files.size() + itn_list_.size()); + + for (const auto &f : files) { + if (config.model_config.debug) { + SHERPA_ONNX_LOGE("rule far: %s", f.c_str()); + } + + auto buf = ReadFile(mgr, f); + + std::unique_ptr s( + new std::istrstream(buf.data(), buf.size())); + + std::unique_ptr> reader( + fst::FarReader::Open(std::move(s))); + + for (; !reader->Done(); reader->Next()) { + std::unique_ptr r( + fst::CastOrConvertToConstFst(reader->GetFst()->Copy())); + + itn_list_.push_back( + std::make_unique(std::move(r))); + } // for (; !reader->Done(); reader->Next()) + } // for (const auto &f : files) + } // if 
(!config.rule_fars.empty()) +} +#endif + +std::string OnlineRecognizerImpl::ApplyInverseTextNormalization( + std::string text) const { + if (!itn_list_.empty()) { + for (const auto &tn : itn_list_) { + text = tn->Normalize(text); + if (config_.model_config.debug) { + SHERPA_ONNX_LOGE("After inverse text normalization: %s", text.c_str()); + } + } + } + + return text; +} + } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/online-recognizer-impl.h b/sherpa-onnx/csrc/online-recognizer-impl.h index 72efedec7..8b569f3af 100644 --- a/sherpa-onnx/csrc/online-recognizer-impl.h +++ b/sherpa-onnx/csrc/online-recognizer-impl.h @@ -9,6 +9,12 @@ #include #include +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#include "kaldifst/csrc/text-normalizer.h" #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/online-recognizer.h" #include "sherpa-onnx/csrc/online-stream.h" @@ -17,10 +23,15 @@ namespace sherpa_onnx { class OnlineRecognizerImpl { public: + explicit OnlineRecognizerImpl(const OnlineRecognizerConfig &config); + static std::unique_ptr Create( const OnlineRecognizerConfig &config); #if __ANDROID_API__ >= 9 + OnlineRecognizerImpl(AAssetManager *mgr, + const OnlineRecognizerConfig &config); + static std::unique_ptr Create( AAssetManager *mgr, const OnlineRecognizerConfig &config); #endif @@ -50,6 +61,15 @@ class OnlineRecognizerImpl { virtual bool IsEndpoint(OnlineStream *s) const = 0; virtual void Reset(OnlineStream *s) const = 0; + + std::string ApplyInverseTextNormalization(std::string text) const; + + private: + OnlineRecognizerConfig config_; + // for inverse text normalization. 
Used only if + // config.rule_fsts is not empty or + // config.rule_fars is not empty + std::vector> itn_list_; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/online-recognizer-paraformer-impl.h b/sherpa-onnx/csrc/online-recognizer-paraformer-impl.h index 8303af5e3..26fdb08c3 100644 --- a/sherpa-onnx/csrc/online-recognizer-paraformer-impl.h +++ b/sherpa-onnx/csrc/online-recognizer-paraformer-impl.h @@ -96,7 +96,8 @@ static void Scale(const float *x, int32_t n, float scale, float *y) { class OnlineRecognizerParaformerImpl : public OnlineRecognizerImpl { public: explicit OnlineRecognizerParaformerImpl(const OnlineRecognizerConfig &config) - : config_(config), + : OnlineRecognizerImpl(config), + config_(config), model_(config.model_config), sym_(config.model_config.tokens), endpoint_(config_.endpoint_config) { @@ -116,7 +117,8 @@ class OnlineRecognizerParaformerImpl : public OnlineRecognizerImpl { #if __ANDROID_API__ >= 9 explicit OnlineRecognizerParaformerImpl(AAssetManager *mgr, const OnlineRecognizerConfig &config) - : config_(config), + : OnlineRecognizerImpl(mgr, config), + config_(config), model_(mgr, config.model_config), sym_(mgr, config.model_config.tokens), endpoint_(config_.endpoint_config) { @@ -160,7 +162,9 @@ class OnlineRecognizerParaformerImpl : public OnlineRecognizerImpl { OnlineRecognizerResult GetResult(OnlineStream *s) const override { auto decoder_result = s->GetParaformerResult(); - return Convert(decoder_result, sym_); + auto r = Convert(decoder_result, sym_); + r.text = ApplyInverseTextNormalization(r.text); + return r; } bool IsEndpoint(OnlineStream *s) const override { diff --git a/sherpa-onnx/csrc/online-recognizer-transducer-impl.h b/sherpa-onnx/csrc/online-recognizer-transducer-impl.h index a2531b10c..2bea765cb 100644 --- a/sherpa-onnx/csrc/online-recognizer-transducer-impl.h +++ b/sherpa-onnx/csrc/online-recognizer-transducer-impl.h @@ -80,7 +80,8 @@ OnlineRecognizerResult Convert(const OnlineTransducerDecoderResult &src, 
class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl { public: explicit OnlineRecognizerTransducerImpl(const OnlineRecognizerConfig &config) - : config_(config), + : OnlineRecognizerImpl(config), + config_(config), model_(OnlineTransducerModel::Create(config.model_config)), sym_(config.model_config.tokens), endpoint_(config_.endpoint_config) { @@ -124,7 +125,8 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl { #if __ANDROID_API__ >= 9 explicit OnlineRecognizerTransducerImpl(AAssetManager *mgr, const OnlineRecognizerConfig &config) - : config_(config), + : OnlineRecognizerImpl(mgr, config), + config_(config), model_(OnlineTransducerModel::Create(mgr, config.model_config)), sym_(mgr, config.model_config.tokens), endpoint_(config_.endpoint_config) { @@ -332,8 +334,10 @@ class OnlineRecognizerTransducerImpl : public OnlineRecognizerImpl { // TODO(fangjun): Remember to change these constants if needed int32_t frame_shift_ms = 10; int32_t subsampling_factor = 4; - return Convert(decoder_result, sym_, frame_shift_ms, subsampling_factor, - s->GetCurrentSegment(), s->GetNumFramesSinceStart()); + auto r = Convert(decoder_result, sym_, frame_shift_ms, subsampling_factor, + s->GetCurrentSegment(), s->GetNumFramesSinceStart()); + r.text = ApplyInverseTextNormalization(std::move(r.text)); + return r; } bool IsEndpoint(OnlineStream *s) const override { diff --git a/sherpa-onnx/csrc/online-recognizer-transducer-nemo-impl.h b/sherpa-onnx/csrc/online-recognizer-transducer-nemo-impl.h index 2391efb1f..700054dc2 100644 --- a/sherpa-onnx/csrc/online-recognizer-transducer-nemo-impl.h +++ b/sherpa-onnx/csrc/online-recognizer-transducer-nemo-impl.h @@ -42,7 +42,8 @@ class OnlineRecognizerTransducerNeMoImpl : public OnlineRecognizerImpl { public: explicit OnlineRecognizerTransducerNeMoImpl( const OnlineRecognizerConfig &config) - : config_(config), + : OnlineRecognizerImpl(config), + config_(config), symbol_table_(config.model_config.tokens), 
endpoint_(config_.endpoint_config), model_( @@ -61,7 +62,8 @@ class OnlineRecognizerTransducerNeMoImpl : public OnlineRecognizerImpl { #if __ANDROID_API__ >= 9 explicit OnlineRecognizerTransducerNeMoImpl( AAssetManager *mgr, const OnlineRecognizerConfig &config) - : config_(config), + : OnlineRecognizerImpl(mgr, config), + config_(config), symbol_table_(mgr, config.model_config.tokens), endpoint_(config_.endpoint_config), model_(std::make_unique( @@ -94,9 +96,11 @@ class OnlineRecognizerTransducerNeMoImpl : public OnlineRecognizerImpl { // TODO(fangjun): Remember to change these constants if needed int32_t frame_shift_ms = 10; int32_t subsampling_factor = model_->SubsamplingFactor(); - return Convert(s->GetResult(), symbol_table_, frame_shift_ms, - subsampling_factor, s->GetCurrentSegment(), - s->GetNumFramesSinceStart()); + auto r = Convert(s->GetResult(), symbol_table_, frame_shift_ms, + subsampling_factor, s->GetCurrentSegment(), + s->GetNumFramesSinceStart()); + r.text = ApplyInverseTextNormalization(std::move(r.text)); + return r; } bool IsEndpoint(OnlineStream *s) const override { diff --git a/sherpa-onnx/csrc/online-recognizer.cc b/sherpa-onnx/csrc/online-recognizer.cc index fcb9169ef..a49a62f6a 100644 --- a/sherpa-onnx/csrc/online-recognizer.cc +++ b/sherpa-onnx/csrc/online-recognizer.cc @@ -14,7 +14,9 @@ #include #include +#include "sherpa-onnx/csrc/file-utils.h" #include "sherpa-onnx/csrc/online-recognizer-impl.h" +#include "sherpa-onnx/csrc/text-utils.h" namespace sherpa_onnx { @@ -100,6 +102,15 @@ void OnlineRecognizerConfig::Register(ParseOptions *po) { "now support greedy_search and modified_beam_search."); po->Register("temperature-scale", &temperature_scale, "Temperature scale for confidence computation in decoding."); + po->Register( + "rule-fsts", &rule_fsts, + "If not empty, it specifies fsts for inverse text normalization. 
" + "If there are multiple fsts, they are separated by a comma."); + + po->Register( + "rule-fars", &rule_fars, + "If not empty, it specifies fst archives for inverse text normalization. " + "If there are multiple archives, they are separated by a comma."); } bool OnlineRecognizerConfig::Validate() const { @@ -129,6 +140,34 @@ bool OnlineRecognizerConfig::Validate() const { return false; } + if (!hotwords_file.empty() && !FileExists(hotwords_file)) { + SHERPA_ONNX_LOGE("--hotwords-file: '%s' does not exist", + hotwords_file.c_str()); + return false; + } + + if (!rule_fsts.empty()) { + std::vector files; + SplitStringToVector(rule_fsts, ",", false, &files); + for (const auto &f : files) { + if (!FileExists(f)) { + SHERPA_ONNX_LOGE("Rule fst '%s' does not exist. ", f.c_str()); + return false; + } + } + } + + if (!rule_fars.empty()) { + std::vector files; + SplitStringToVector(rule_fars, ",", false, &files); + for (const auto &f : files) { + if (!FileExists(f)) { + SHERPA_ONNX_LOGE("Rule far '%s' does not exist. ", f.c_str()); + return false; + } + } + } + return model_config.Validate(); } @@ -147,7 +186,9 @@ std::string OnlineRecognizerConfig::ToString() const { os << "hotwords_file=\"" << hotwords_file << "\", "; os << "decoding_method=\"" << decoding_method << "\", "; os << "blank_penalty=" << blank_penalty << ", "; - os << "temperature_scale=" << temperature_scale << ")"; + os << "temperature_scale=" << temperature_scale << ", "; + os << "rule_fsts=\"" << rule_fsts << "\", "; + os << "rule_fars=\"" << rule_fars << "\")"; return os.str(); } diff --git a/sherpa-onnx/csrc/online-recognizer.h b/sherpa-onnx/csrc/online-recognizer.h index f7fcf2f21..7fde367fb 100644 --- a/sherpa-onnx/csrc/online-recognizer.h +++ b/sherpa-onnx/csrc/online-recognizer.h @@ -100,6 +100,12 @@ struct OnlineRecognizerConfig { float temperature_scale = 2.0; + // If there are multiple rules, they are applied from left to right. 
+ std::string rule_fsts; + + // If there are multiple FST archives, they are applied from left to right. + std::string rule_fars; + OnlineRecognizerConfig() = default; OnlineRecognizerConfig( @@ -109,7 +115,8 @@ struct OnlineRecognizerConfig { const OnlineCtcFstDecoderConfig &ctc_fst_decoder_config, bool enable_endpoint, const std::string &decoding_method, int32_t max_active_paths, const std::string &hotwords_file, - float hotwords_score, float blank_penalty, float temperature_scale) + float hotwords_score, float blank_penalty, float temperature_scale, + const std::string &rule_fsts, const std::string &rule_fars) : feat_config(feat_config), model_config(model_config), lm_config(lm_config), @@ -121,7 +128,9 @@ struct OnlineRecognizerConfig { hotwords_file(hotwords_file), hotwords_score(hotwords_score), blank_penalty(blank_penalty), - temperature_scale(temperature_scale) {} + temperature_scale(temperature_scale), + rule_fsts(rule_fsts), + rule_fars(rule_fars) {} void Register(ParseOptions *po); bool Validate() const; diff --git a/sherpa-onnx/python/csrc/online-recognizer.cc b/sherpa-onnx/python/csrc/online-recognizer.cc index 148f73ee5..fe6cd454a 100644 --- a/sherpa-onnx/python/csrc/online-recognizer.cc +++ b/sherpa-onnx/python/csrc/online-recognizer.cc @@ -54,19 +54,20 @@ static void PybindOnlineRecognizerResult(py::module *m) { static void PybindOnlineRecognizerConfig(py::module *m) { using PyClass = OnlineRecognizerConfig; py::class_(*m, "OnlineRecognizerConfig") - .def( - py::init(), - py::arg("feat_config"), py::arg("model_config"), - py::arg("lm_config") = OnlineLMConfig(), - py::arg("endpoint_config") = EndpointConfig(), - py::arg("ctc_fst_decoder_config") = OnlineCtcFstDecoderConfig(), - py::arg("enable_endpoint"), py::arg("decoding_method"), - py::arg("max_active_paths") = 4, py::arg("hotwords_file") = "", - py::arg("hotwords_score") = 0, py::arg("blank_penalty") = 0.0, - py::arg("temperature_scale") = 2.0) + .def(py::init(), + py::arg("feat_config"), 
py::arg("model_config"), + py::arg("lm_config") = OnlineLMConfig(), + py::arg("endpoint_config") = EndpointConfig(), + py::arg("ctc_fst_decoder_config") = OnlineCtcFstDecoderConfig(), + py::arg("enable_endpoint"), py::arg("decoding_method"), + py::arg("max_active_paths") = 4, py::arg("hotwords_file") = "", + py::arg("hotwords_score") = 0, py::arg("blank_penalty") = 0.0, + py::arg("temperature_scale") = 2.0, py::arg("rule_fsts") = "", + py::arg("rule_fars") = "") .def_readwrite("feat_config", &PyClass::feat_config) .def_readwrite("model_config", &PyClass::model_config) .def_readwrite("lm_config", &PyClass::lm_config) @@ -79,6 +80,8 @@ static void PybindOnlineRecognizerConfig(py::module *m) { .def_readwrite("hotwords_score", &PyClass::hotwords_score) .def_readwrite("blank_penalty", &PyClass::blank_penalty) .def_readwrite("temperature_scale", &PyClass::temperature_scale) + .def_readwrite("rule_fsts", &PyClass::rule_fsts) + .def_readwrite("rule_fars", &PyClass::rule_fars) .def("__str__", &PyClass::ToString); } diff --git a/sherpa-onnx/python/sherpa_onnx/online_recognizer.py b/sherpa-onnx/python/sherpa_onnx/online_recognizer.py index 97f7472b4..82b2e3b42 100644 --- a/sherpa-onnx/python/sherpa_onnx/online_recognizer.py +++ b/sherpa-onnx/python/sherpa_onnx/online_recognizer.py @@ -64,6 +64,8 @@ def from_transducer( lm_scale: float = 0.1, temperature_scale: float = 2.0, debug: bool = False, + rule_fsts: str = "", + rule_fars: str = "", ): """ Please refer to @@ -148,6 +150,12 @@ def from_transducer( the log probability, you can get it from the directory where your bpe model is generated. Only used when hotwords provided and the modeling unit is bpe or cjkchar+bpe. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. + If there are multiple archives, they are separated by a comma. 
""" self = cls.__new__(cls) _assert_file_exists(tokens) @@ -217,6 +225,8 @@ def from_transducer( hotwords_file=hotwords_file, blank_penalty=blank_penalty, temperature_scale=temperature_scale, + rule_fsts=rule_fsts, + rule_fars=rule_fars, ) self.recognizer = _Recognizer(recognizer_config) @@ -239,6 +249,8 @@ def from_paraformer( decoding_method: str = "greedy_search", provider: str = "cpu", debug: bool = False, + rule_fsts: str = "", + rule_fars: str = "", ): """ Please refer to @@ -283,6 +295,12 @@ def from_paraformer( The only valid value is greedy_search. provider: onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. + If there are multiple archives, they are separated by a comma. """ self = cls.__new__(cls) _assert_file_exists(tokens) @@ -322,6 +340,8 @@ def from_paraformer( endpoint_config=endpoint_config, enable_endpoint=enable_endpoint_detection, decoding_method=decoding_method, + rule_fsts=rule_fsts, + rule_fars=rule_fars, ) self.recognizer = _Recognizer(recognizer_config) @@ -345,6 +365,8 @@ def from_zipformer2_ctc( ctc_max_active: int = 3000, provider: str = "cpu", debug: bool = False, + rule_fsts: str = "", + rule_fars: str = "", ): """ Please refer to @@ -393,6 +415,12 @@ def from_zipformer2_ctc( active paths at a time. provider: onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. + If there are multiple archives, they are separated by a comma. 
""" self = cls.__new__(cls) _assert_file_exists(tokens) @@ -433,6 +461,8 @@ def from_zipformer2_ctc( ctc_fst_decoder_config=ctc_fst_decoder_config, enable_endpoint=enable_endpoint_detection, decoding_method=decoding_method, + rule_fsts=rule_fsts, + rule_fars=rule_fars, ) self.recognizer = _Recognizer(recognizer_config) @@ -454,6 +484,8 @@ def from_nemo_ctc( decoding_method: str = "greedy_search", provider: str = "cpu", debug: bool = False, + rule_fsts: str = "", + rule_fars: str = "", ): """ Please refer to @@ -497,6 +529,12 @@ def from_nemo_ctc( onnxruntime execution providers. Valid values are: cpu, cuda, coreml. debug: True to show meta data in the model. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. + If there are multiple archives, they are separated by a comma. """ self = cls.__new__(cls) _assert_file_exists(tokens) @@ -533,6 +571,8 @@ def from_nemo_ctc( endpoint_config=endpoint_config, enable_endpoint=enable_endpoint_detection, decoding_method=decoding_method, + rule_fsts=rule_fsts, + rule_fars=rule_fars, ) self.recognizer = _Recognizer(recognizer_config) @@ -556,6 +596,8 @@ def from_wenet_ctc( decoding_method: str = "greedy_search", provider: str = "cpu", debug: bool = False, + rule_fsts: str = "", + rule_fars: str = "", ): """ Please refer to @@ -602,6 +644,12 @@ def from_wenet_ctc( The only valid value is greedy_search. provider: onnxruntime execution providers. Valid values are: cpu, cuda, coreml. + rule_fsts: + If not empty, it specifies fsts for inverse text normalization. + If there are multiple fsts, they are separated by a comma. + rule_fars: + If not empty, it specifies fst archives for inverse text normalization. + If there are multiple archives, they are separated by a comma. 
""" self = cls.__new__(cls) _assert_file_exists(tokens) @@ -640,6 +688,8 @@ def from_wenet_ctc( endpoint_config=endpoint_config, enable_endpoint=enable_endpoint_detection, decoding_method=decoding_method, + rule_fsts=rule_fsts, + rule_fars=rule_fars, ) self.recognizer = _Recognizer(recognizer_config) From 6789c909d25bdd1ed0a502454c30f30237549cbf Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 18 Jun 2024 13:42:17 +0800 Subject: [PATCH 028/237] Inverse text normalization API of streaming ASR for various programming languages (#1022) --- .github/scripts/test-dart.sh | 2 + .github/scripts/test-dot-net.sh | 13 +- .github/scripts/test-nodejs-addon-npm.sh | 9 ++ .github/scripts/test-nodejs-npm.sh | 10 ++ .github/workflows/build-wheels-aarch64.yaml | 2 + .github/workflows/build-wheels-armv7l.yaml | 2 + .github/workflows/build-wheels-linux.yaml | 2 + .../workflows/build-wheels-macos-arm64.yaml | 4 +- .github/workflows/build-wheels-macos-x64.yaml | 2 +- .github/workflows/build-wheels-win32.yaml | 2 + .github/workflows/run-java-test.yaml | 3 + .github/workflows/test-go.yaml | 3 + CMakeLists.txt | 3 +- .../com/k2fsa/sherpa/onnx/MainActivity.kt | 7 + .../com/k2fsa/sherpa/onnx/MainActivity.kt | 11 ++ .../com/k2fsa/sherpa/onnx/MainActivity.kt | 5 + cmake/kaldi-decoder.cmake | 18 +-- .../non-streaming-asr/pubspec.yaml | 2 +- .../bin/zipformer-transducer.dart | 7 +- dart-api-examples/streaming-asr/pubspec.yaml | 2 +- .../run-zipformer-transducer-itn.sh | 28 ++++ dart-api-examples/tts/pubspec.yaml | 2 +- dart-api-examples/vad/pubspec.yaml | 2 +- .../online-decode-files/Program.cs | 5 + .../online-decode-files/run-transducer-itn.sh | 28 ++++ .../streaming-decode-files/main.go | 2 + .../run-transducer-itn.sh | 30 ++++ ...eTextNormalizationStreamingTransducer.java | 68 +++++++++ ...n-inverse-text-normalization-transducer.sh | 45 ++++++ kotlin-api-examples/run.sh | 37 ++++- ...est_itn_asr.kt => test_itn_offline_asr.kt} | 0 kotlin-api-examples/test_itn_online_asr.kt | 41 ++++++ 
nodejs-addon-examples/package.json | 2 +- .../test_asr_streaming_transducer_itn.js | 59 ++++++++ ...asr_streaming_transducer_microphone_itn.js | 88 ++++++++++++ nodejs-examples/test-online-transducer-itn.js | 131 ++++++++++++++++++ scripts/apk/build-apk-asr-2pass.sh.in | 11 ++ scripts/apk/build-apk-asr.sh.in | 7 + scripts/apk/build-apk-vad-asr.sh.in | 6 + scripts/apk/generate-asr-2pass-apk-script.py | 21 +++ scripts/apk/generate-asr-apk-script.py | 14 ++ scripts/apk/generate-vad-asr-apk-script.py | 14 ++ scripts/dotnet/OnlineRecognizerConfig.cs | 8 ++ .../run-transducer-itn.sh | 1 + scripts/go/release.sh | 10 +- scripts/go/sherpa_onnx.go | 15 ++ scripts/node-addon-api/src/streaming-asr.cc | 10 ++ sherpa-onnx/c-api/c-api.cc | 3 + sherpa-onnx/c-api/c-api.h | 2 + sherpa-onnx/csrc/CMakeLists.txt | 3 +- sherpa-onnx/csrc/offline-recognizer-impl.cc | 3 - sherpa-onnx/csrc/online-recognizer-impl.cc | 5 +- sherpa-onnx/flutter/CHANGELOG.md | 4 + .../flutter/lib/src/online_recognizer.dart | 10 +- .../flutter/lib/src/sherpa_onnx_bindings.dart | 3 + .../sherpa/onnx/OnlineRecognizerConfig.java | 16 +++ sherpa-onnx/jni/online-recognizer.cc | 12 ++ sherpa-onnx/kotlin-api/OnlineRecognizer.kt | 2 + swift-api-examples/SherpaOnnx.swift | 8 +- wasm/asr/CMakeLists.txt | 2 + wasm/asr/sherpa-onnx-asr.js | 28 +++- wasm/asr/sherpa-onnx-wasm-main-asr.cc | 4 +- wasm/kws/CMakeLists.txt | 3 +- wasm/tts/CMakeLists.txt | 2 + 64 files changed, 849 insertions(+), 55 deletions(-) create mode 100755 dart-api-examples/streaming-asr/run-zipformer-transducer-itn.sh create mode 100755 dotnet-examples/online-decode-files/run-transducer-itn.sh create mode 100755 go-api-examples/streaming-decode-files/run-transducer-itn.sh create mode 100644 java-api-examples/InverseTextNormalizationStreamingTransducer.java create mode 100755 java-api-examples/run-inverse-text-normalization-transducer.sh rename kotlin-api-examples/{test_itn_asr.kt => test_itn_offline_asr.kt} (100%) create mode 100644 
kotlin-api-examples/test_itn_online_asr.kt create mode 100644 nodejs-addon-examples/test_asr_streaming_transducer_itn.js create mode 100644 nodejs-addon-examples/test_asr_streaming_transducer_microphone_itn.js create mode 100644 nodejs-examples/test-online-transducer-itn.js create mode 120000 scripts/go/_internal/streaming-decode-files/run-transducer-itn.sh diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index 0850a72b1..b0be657a7 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -66,7 +66,9 @@ echo '----------streaming zipformer ctc----------' rm -rf sherpa-onnx-* echo '----------streaming zipformer transducer----------' +./run-zipformer-transducer-itn.sh ./run-zipformer-transducer.sh +rm -f itn* rm -rf sherpa-onnx-* echo '----------streaming NeMo transducer----------' diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index 395c67c83..845162542 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -2,7 +2,13 @@ cd dotnet-examples/ -cd ./offline-decode-files +cd ./online-decode-files +./run-transducer-itn.sh +./run-zipformer2-ctc.sh +./run-transducer.sh +./run-paraformer.sh + +cd ../offline-decode-files ./run-paraformer-itn.sh ./run-telespeech-ctc.sh ./run-nemo-ctc.sh @@ -27,11 +33,6 @@ cd ../streaming-hlg-decoding/ cd ../spoken-language-identification ./run.sh -cd ../online-decode-files -./run-zipformer2-ctc.sh -./run-transducer.sh -./run-paraformer.sh - cd ../offline-tts ./run-aishell3.sh ./run-piper.sh diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh index a6c4345a8..5ff89d30e 100755 --- a/.github/scripts/test-nodejs-addon-npm.sh +++ b/.github/scripts/test-nodejs-addon-npm.sh @@ -70,6 +70,13 @@ curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/s tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 rm 
sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +rm -f itn* + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav + +node test_asr_streaming_transducer_itn.js + node test_asr_streaming_transducer.js rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 @@ -120,6 +127,8 @@ rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 node ./test_asr_non_streaming_paraformer.js +rm -f itn* + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index 2098bb166..8428c1df5 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -15,6 +15,8 @@ curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/s ls -lh tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 + +rm -f itn* curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst node ./test-offline-paraformer-itn.js @@ -57,7 +59,15 @@ rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + +rm -f itn* +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + +node 
./test-online-transducer-itn.js + node ./test-online-transducer.js + rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13.tar.bz2 diff --git a/.github/workflows/build-wheels-aarch64.yaml b/.github/workflows/build-wheels-aarch64.yaml index 4bc5b79c2..d9d042324 100644 --- a/.github/workflows/build-wheels-aarch64.yaml +++ b/.github/workflows/build-wheels-aarch64.yaml @@ -2,6 +2,8 @@ name: build-wheels-aarch64 on: push: + branches: + - wheel tags: - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/build-wheels-armv7l.yaml b/.github/workflows/build-wheels-armv7l.yaml index 6b7d74460..a2a2a49aa 100644 --- a/.github/workflows/build-wheels-armv7l.yaml +++ b/.github/workflows/build-wheels-armv7l.yaml @@ -2,6 +2,8 @@ name: build-wheels-armv7l on: push: + branches: + - wheel tags: - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/build-wheels-linux.yaml b/.github/workflows/build-wheels-linux.yaml index 426545622..9d94f1c3a 100644 --- a/.github/workflows/build-wheels-linux.yaml +++ b/.github/workflows/build-wheels-linux.yaml @@ -2,6 +2,8 @@ name: build-wheels-linux on: push: + branches: + - wheel tags: - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/build-wheels-macos-arm64.yaml b/.github/workflows/build-wheels-macos-arm64.yaml index bc02ce38f..5883ba483 100644 --- a/.github/workflows/build-wheels-macos-arm64.yaml +++ b/.github/workflows/build-wheels-macos-arm64.yaml @@ -2,6 +2,8 @@ name: build-wheels-macos-arm64 on: push: + branches: + - wheel tags: - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: @@ -84,7 +86,7 @@ jobs: run: | opts='--break-system-packages' v=${{ matrix.python-version }} - if [[ $v == cp38 || $v == cp39 ]]; then + if [[ $v == cp37 || $v == cp38 || $v == cp39 ]]; then opts='' fi diff --git 
a/.github/workflows/build-wheels-macos-x64.yaml b/.github/workflows/build-wheels-macos-x64.yaml index 8ad21d0ed..df4d8049e 100644 --- a/.github/workflows/build-wheels-macos-x64.yaml +++ b/.github/workflows/build-wheels-macos-x64.yaml @@ -101,7 +101,7 @@ jobs: run: | opts='--break-system-packages' v=${{ matrix.python-version }} - if [[ $v == cp38 || $v == cp39 ]]; then + if [[ $v == cp37 || $v == cp38 || $v == cp39 ]]; then opts='' fi diff --git a/.github/workflows/build-wheels-win32.yaml b/.github/workflows/build-wheels-win32.yaml index ab3d32b13..b2dbd157d 100644 --- a/.github/workflows/build-wheels-win32.yaml +++ b/.github/workflows/build-wheels-win32.yaml @@ -2,6 +2,8 @@ name: build-wheels-win32 on: push: + branches: + - wheel tags: - 'v[0-9]+.[0-9]+.[0-9]+*' workflow_dispatch: diff --git a/.github/workflows/run-java-test.yaml b/.github/workflows/run-java-test.yaml index 30fc6a827..14d22bd2e 100644 --- a/.github/workflows/run-java-test.yaml +++ b/.github/workflows/run-java-test.yaml @@ -173,6 +173,9 @@ jobs: shell: bash run: | cd ./java-api-examples + ./run-inverse-text-normalization-transducer.sh + rm -rf sherpa-onnx-streaming-* + ./run-streaming-decode-file-ctc.sh # Delete model files to save space rm -rf sherpa-onnx-streaming-* diff --git a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index 1ec654418..6f25a0139 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -187,6 +187,9 @@ jobs: ./run-transducer.sh rm -rf sherpa-onnx-streaming-zipformer-en-2023-06-26 + ./run-transducer-itn.sh + rm -rf sherpa-onnx-streaming-* + echo "Test paraformer" ./run-paraformer.sh rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en diff --git a/CMakeLists.txt b/CMakeLists.txt index 974dd1c8f..dedf52767 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,8 @@ project(sherpa-onnx) # Remember to update # ./nodejs-addon-examples # ./dart-api-examples/ -set(SHERPA_ONNX_VERSION "1.9.30") +# ./sherpa-onnx/flutter/CHANGELOG.md 
+set(SHERPA_ONNX_VERSION "1.10.0") # Disable warning about # diff --git a/android/SherpaOnnx/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt b/android/SherpaOnnx/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt index fa2829635..8c96623fc 100644 --- a/android/SherpaOnnx/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt +++ b/android/SherpaOnnx/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt @@ -196,6 +196,9 @@ class MainActivity : AppCompatActivity() { // See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html // for a list of available models val type = 0 + var ruleFsts : String? + ruleFsts = null + Log.i(TAG, "Select model type $type") val config = OnlineRecognizerConfig( featConfig = getFeatureConfig(sampleRate = sampleRateInHz, featureDim = 80), @@ -205,6 +208,10 @@ class MainActivity : AppCompatActivity() { enableEndpoint = true, ) + if (ruleFsts != null) { + config.ruleFsts = ruleFsts + } + recognizer = OnlineRecognizer( assetManager = application.assets, config = config, diff --git a/android/SherpaOnnx2Pass/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt b/android/SherpaOnnx2Pass/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt index 596d03e09..d3ad9aa94 100644 --- a/android/SherpaOnnx2Pass/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt +++ b/android/SherpaOnnx2Pass/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt @@ -194,6 +194,8 @@ class MainActivity : AppCompatActivity() { // See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html // for a list of available models val firstType = 9 + val firstRuleFsts: String? 
+ firstRuleFsts = null Log.i(TAG, "Select model type $firstType for the first pass") val config = OnlineRecognizerConfig( featConfig = getFeatureConfig(sampleRate = sampleRateInHz, featureDim = 80), @@ -201,6 +203,9 @@ class MainActivity : AppCompatActivity() { endpointConfig = getEndpointConfig(), enableEndpoint = true, ) + if (firstRuleFsts != null) { + config.ruleFsts = firstRuleFsts; + } onlineRecognizer = OnlineRecognizer( assetManager = application.assets, @@ -213,6 +218,8 @@ class MainActivity : AppCompatActivity() { // See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html // for a list of available models val secondType = 0 + var secondRuleFsts: String? + secondRuleFsts = null Log.i(TAG, "Select model type $secondType for the second pass") val config = OfflineRecognizerConfig( @@ -220,6 +227,10 @@ class MainActivity : AppCompatActivity() { modelConfig = getOfflineModelConfig(type = secondType)!!, ) + if (secondRuleFsts != null) { + config.ruleFsts = secondRuleFsts + } + offlineRecognizer = OfflineRecognizer( assetManager = application.assets, config = config, diff --git a/android/SherpaOnnxVadAsr/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt b/android/SherpaOnnxVadAsr/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt index e221553c2..fd7d60280 100644 --- a/android/SherpaOnnxVadAsr/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt +++ b/android/SherpaOnnxVadAsr/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt @@ -200,12 +200,17 @@ class MainActivity : AppCompatActivity() { // See https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html // for a list of available models val asrModelType = 0 + val asrRuleFsts: String? 
+ asrRuleFsts = null Log.i(TAG, "Select model type ${asrModelType} for ASR") val config = OfflineRecognizerConfig( featConfig = getFeatureConfig(sampleRate = sampleRateInHz, featureDim = 80), modelConfig = getOfflineModelConfig(type = asrModelType)!!, ) + if (asrRuleFsts != null) { + config.ruleFsts = asrRuleFsts; + } offlineRecognizer = OfflineRecognizer( assetManager = application.assets, diff --git a/cmake/kaldi-decoder.cmake b/cmake/kaldi-decoder.cmake index aa937b3e4..02e62e44f 100644 --- a/cmake/kaldi-decoder.cmake +++ b/cmake/kaldi-decoder.cmake @@ -63,23 +63,15 @@ function(download_kaldi_decoder) kaldi-decoder-core kaldifst_core fst + fstfar DESTINATION ..) - if(SHERPA_ONNX_ENABLE_TTS) - install(TARGETS - fstfar - DESTINATION ..) - endif() else() install(TARGETS kaldi-decoder-core kaldifst_core fst + fstfar DESTINATION lib) - if(SHERPA_ONNX_ENABLE_TTS) - install(TARGETS - fstfar - DESTINATION lib) - endif() endif() if(WIN32 AND BUILD_SHARED_LIBS) @@ -87,12 +79,8 @@ function(download_kaldi_decoder) kaldi-decoder-core kaldifst_core fst + fstfar DESTINATION bin) - if(SHERPA_ONNX_ENABLE_TTS) - install(TARGETS - fstfar - DESTINATION bin) - endif() endif() endfunction() diff --git a/dart-api-examples/non-streaming-asr/pubspec.yaml b/dart-api-examples/non-streaming-asr/pubspec.yaml index 9253f105d..81280c349 100644 --- a/dart-api-examples/non-streaming-asr/pubspec.yaml +++ b/dart-api-examples/non-streaming-asr/pubspec.yaml @@ -10,7 +10,7 @@ environment: # Add regular dependencies here. 
dependencies: - sherpa_onnx: ^1.9.30 + sherpa_onnx: ^1.10.0 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/streaming-asr/bin/zipformer-transducer.dart b/dart-api-examples/streaming-asr/bin/zipformer-transducer.dart index 438af31e5..3e642d00f 100644 --- a/dart-api-examples/streaming-asr/bin/zipformer-transducer.dart +++ b/dart-api-examples/streaming-asr/bin/zipformer-transducer.dart @@ -15,6 +15,7 @@ void main(List arguments) async { ..addOption('decoder', help: 'Path to decoder model') ..addOption('joiner', help: 'Path to joiner model') ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '') ..addOption('input-wav', help: 'Path to input.wav to transcribe'); final res = parser.parse(arguments); @@ -31,6 +32,7 @@ void main(List arguments) async { final decoder = res['decoder'] as String; final joiner = res['joiner'] as String; final tokens = res['tokens'] as String; + final ruleFsts = res['rule-fsts'] as String; final inputWav = res['input-wav'] as String; final transducer = sherpa_onnx.OnlineTransducerModelConfig( @@ -45,7 +47,10 @@ void main(List arguments) async { debug: true, numThreads: 1, ); - final config = sherpa_onnx.OnlineRecognizerConfig(model: modelConfig); + final config = sherpa_onnx.OnlineRecognizerConfig( + model: modelConfig, + ruleFsts: ruleFsts, + ); final recognizer = sherpa_onnx.OnlineRecognizer(config); final waveData = sherpa_onnx.readWave(inputWav); diff --git a/dart-api-examples/streaming-asr/pubspec.yaml b/dart-api-examples/streaming-asr/pubspec.yaml index a740b371c..5305be304 100644 --- a/dart-api-examples/streaming-asr/pubspec.yaml +++ b/dart-api-examples/streaming-asr/pubspec.yaml @@ -11,7 +11,7 @@ environment: # Add regular dependencies here. 
dependencies: - sherpa_onnx: ^1.9.30 + sherpa_onnx: ^1.10.0 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/streaming-asr/run-zipformer-transducer-itn.sh b/dart-api-examples/streaming-asr/run-zipformer-transducer-itn.sh new file mode 100755 index 000000000..2169f71db --- /dev/null +++ b/dart-api-examples/streaming-asr/run-zipformer-transducer-itn.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +fi + +if [ ! -f ./itn-zh-number.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +fi + +if [ ! -f ./itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst +fi + +dart run \ + ./bin/zipformer-transducer.dart \ + --encoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx \ + --decoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ + --joiner ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx \ + --tokens ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + --rule-fsts ./itn_zh_number.fst \ + --input-wav ./itn-zh-number.wav diff --git a/dart-api-examples/tts/pubspec.yaml b/dart-api-examples/tts/pubspec.yaml index fed0bf4e6..c3f1d9119 100644 --- a/dart-api-examples/tts/pubspec.yaml +++ b/dart-api-examples/tts/pubspec.yaml @@ -8,7 +8,7 @@ environment: # Add regular dependencies here. 
dependencies: - sherpa_onnx: ^1.9.30 + sherpa_onnx: ^1.10.0 path: ^1.9.0 args: ^2.5.0 diff --git a/dart-api-examples/vad/pubspec.yaml b/dart-api-examples/vad/pubspec.yaml index e0d6f6dce..8194273fb 100644 --- a/dart-api-examples/vad/pubspec.yaml +++ b/dart-api-examples/vad/pubspec.yaml @@ -9,7 +9,7 @@ environment: sdk: ^3.4.0 dependencies: - sherpa_onnx: ^1.9.30 + sherpa_onnx: ^1.10.0 path: ^1.9.0 args: ^2.5.0 diff --git a/dotnet-examples/online-decode-files/Program.cs b/dotnet-examples/online-decode-files/Program.cs index 5103dc010..3e8ee93e8 100644 --- a/dotnet-examples/online-decode-files/Program.cs +++ b/dotnet-examples/online-decode-files/Program.cs @@ -85,6 +85,10 @@ larger than this value after something that is not blank has been decoded. Used [Option("hotwords-score", Required = false, Default = 1.5F, HelpText = "hotwords score")] public float HotwordsScore { get; set; } + [Option("rule-fsts", Required = false, Default = "", + HelpText = "If not empty, path to rule fst for inverse text normalization")] + public string RuleFsts { get; set; } + [Option("files", Required = true, HelpText = "Audio files for decoding")] public IEnumerable Files { get; set; } @@ -189,6 +193,7 @@ private static void Run(Options options) config.Rule3MinUtteranceLength = options.Rule3MinUtteranceLength; config.HotwordsFile = options.HotwordsFile; config.HotwordsScore = options.HotwordsScore; + config.RuleFsts = options.RuleFsts; OnlineRecognizer recognizer = new OnlineRecognizer(config); diff --git a/dotnet-examples/online-decode-files/run-transducer-itn.sh b/dotnet-examples/online-decode-files/run-transducer-itn.sh new file mode 100755 index 000000000..17c595789 --- /dev/null +++ b/dotnet-examples/online-decode-files/run-transducer-itn.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Please refer to +# 
https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english +# to download the model files + +set -ex +if [ ! -d ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +fi + +if [ ! -f ./itn-zh-number.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +fi + +if [ ! -f ./itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst +fi + +dotnet run -c Release \ + --tokens ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + --encoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx \ + --decoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.int8.onnx \ + --joiner ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx \ + --decoding-method greedy_search \ + --files ./itn-zh-number.wav diff --git a/go-api-examples/streaming-decode-files/main.go b/go-api-examples/streaming-decode-files/main.go index 5ec2c7cbb..d96b53336 100644 --- a/go-api-examples/streaming-decode-files/main.go +++ b/go-api-examples/streaming-decode-files/main.go @@ -30,6 +30,8 @@ func main() { flag.StringVar(&config.ModelConfig.Provider, "provider", "cpu", "Provider to use") flag.StringVar(&config.DecodingMethod, "decoding-method", "greedy_search", "Decoding method. 
Possible values: greedy_search, modified_beam_search") flag.IntVar(&config.MaxActivePaths, "max-active-paths", 4, "Used only when --decoding-method is modified_beam_search") + flag.StringVar(&config.RuleFsts, "rule-fsts", "", "If not empty, path to rule fst for inverse text normalization") + flag.StringVar(&config.RuleFars, "rule-fars", "", "If not empty, path to rule fst archives for inverse text normalization") flag.Parse() diff --git a/go-api-examples/streaming-decode-files/run-transducer-itn.sh b/go-api-examples/streaming-decode-files/run-transducer-itn.sh new file mode 100755 index 000000000..47bb13a71 --- /dev/null +++ b/go-api-examples/streaming-decode-files/run-transducer-itn.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -f ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +fi + +if [ ! -f ./itn-zh-number.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +fi + +if [ ! 
-f ./itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst +fi + +go mod tidy +go build + +./streaming-decode-files \ + --encoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx \ + --decoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx \ + --joiner ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx \ + --tokens ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt \ + --model-type zipformer \ + --rule-fsts ./itn_zh_number.fst \ + --debug 0 \ + ./itn-zh-number.wav diff --git a/java-api-examples/InverseTextNormalizationStreamingTransducer.java b/java-api-examples/InverseTextNormalizationStreamingTransducer.java new file mode 100644 index 000000000..c3cc325d4 --- /dev/null +++ b/java-api-examples/InverseTextNormalizationStreamingTransducer.java @@ -0,0 +1,68 @@ +// Copyright 2024 Xiaomi Corporation + +// This file shows how to use a streaming transducer +// to decode files with inverse text normalization. 
+import com.k2fsa.sherpa.onnx.*; + +public class InverseTextNormalizationStreamingTransducer { + public static void main(String[] args) { + // please refer to + // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english + // to download model files + String encoder = + "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx"; + String decoder = + "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx"; + String joiner = + "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx"; + String tokens = "./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt"; + + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav + String waveFilename = "./itn-zh-number.wav"; + + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + String ruleFsts = "./itn_zh_number.fst"; + + WaveReader reader = new WaveReader(waveFilename); + + OnlineTransducerModelConfig transducer = + OnlineTransducerModelConfig.builder() + .setEncoder(encoder) + .setDecoder(decoder) + .setJoiner(joiner) + .build(); + + OnlineModelConfig modelConfig = + OnlineModelConfig.builder() + .setTransducer(transducer) + .setTokens(tokens) + .setNumThreads(1) + .setDebug(true) + .build(); + + OnlineRecognizerConfig config = + OnlineRecognizerConfig.builder() + .setOnlineModelConfig(modelConfig) + .setDecodingMethod("greedy_search") + .setRuleFsts(ruleFsts) + .build(); + + OnlineRecognizer recognizer = new OnlineRecognizer(config); + OnlineStream stream = recognizer.createStream(); + stream.acceptWaveform(reader.getSamples(), reader.getSampleRate()); + + float[] tailPaddings = new float[(int) (0.8 * reader.getSampleRate())]; + stream.acceptWaveform(tailPaddings, reader.getSampleRate()); + + 
while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + + String text = recognizer.getResult(stream).getText(); + + System.out.printf("filename:%s\nresult:%s\n", waveFilename, text); + + stream.release(); + recognizer.release(); + } +} diff --git a/java-api-examples/run-inverse-text-normalization-transducer.sh b/java-api-examples/run-inverse-text-normalization-transducer.sh new file mode 100755 index 000000000..509d71f07 --- /dev/null +++ b/java-api-examples/run-inverse-text-normalization-transducer.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +set -ex + +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then + mkdir -p ../build + pushd ../build + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=ON \ + .. + + make -j4 + ls -lh lib + popd +fi + +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then + pushd ../sherpa-onnx/java-api + make + popd +fi + +if [ ! -f ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +fi + +if [ ! -f ./itn-zh-number.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +fi + +if [ ! 
-f ./itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst +fi + +java \ + -Djava.library.path=$PWD/../build/lib \ + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ + InverseTextNormalizationStreamingTransducer.java diff --git a/kotlin-api-examples/run.sh b/kotlin-api-examples/run.sh index a96c09748..5b58620e1 100755 --- a/kotlin-api-examples/run.sh +++ b/kotlin-api-examples/run.sh @@ -203,7 +203,7 @@ function testOfflineAsr() { java -Djava.library.path=../build/lib -jar $out_filename } -function testInverseTextNormalizationAsr() { +function testInverseTextNormalizationOfflineAsr() { if [ ! -f ./sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt ]; then curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 @@ -218,9 +218,9 @@ function testInverseTextNormalizationAsr() { curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst fi - out_filename=test_offline_asr.jar + out_filename=test_itn_offline_asr.jar kotlinc-jvm -include-runtime -d $out_filename \ - test_itn_asr.kt \ + test_itn_offline_asr.kt \ FeatureConfig.kt \ OfflineRecognizer.kt \ OfflineStream.kt \ @@ -231,6 +231,34 @@ function testInverseTextNormalizationAsr() { java -Djava.library.path=../build/lib -jar $out_filename } +function testInverseTextNormalizationOnlineAsr() { + if [ ! -f ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 + fi + + if [ ! 
-f ./itn-zh-number.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav + fi + + if [ ! -f ./itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi + + out_filename=test_itn_online_asr.jar + kotlinc-jvm -include-runtime -d $out_filename \ + test_itn_online_asr.kt \ + FeatureConfig.kt \ + OnlineRecognizer.kt \ + OnlineStream.kt \ + WaveReader.kt \ + faked-asset-manager.kt + + ls -lh $out_filename + java -Djava.library.path=../build/lib -jar $out_filename +} + function testPunctuation() { if [ ! -f ./sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12/model.onnx ]; then curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/punctuation-models/sherpa-onnx-punct-ct-transformer-zh-en-vocab272727-2024-04-12.tar.bz2 @@ -257,4 +285,5 @@ testAudioTagging testSpokenLanguageIdentification testOfflineAsr testPunctuation -testInverseTextNormalizationAsr +testInverseTextNormalizationOfflineAsr +testInverseTextNormalizationOnlineAsr diff --git a/kotlin-api-examples/test_itn_asr.kt b/kotlin-api-examples/test_itn_offline_asr.kt similarity index 100% rename from kotlin-api-examples/test_itn_asr.kt rename to kotlin-api-examples/test_itn_offline_asr.kt diff --git a/kotlin-api-examples/test_itn_online_asr.kt b/kotlin-api-examples/test_itn_online_asr.kt new file mode 100644 index 000000000..27b6024b2 --- /dev/null +++ b/kotlin-api-examples/test_itn_online_asr.kt @@ -0,0 +1,41 @@ +package com.k2fsa.sherpa.onnx + +fun main() { + test() +} + +fun test() { + val recognizer = createOnlineRecognizer() + val waveFilename = "./itn-zh-number.wav"; + + val objArray = WaveReader.readWaveFromFile( + filename = waveFilename, + ) + val samples: FloatArray = objArray[0] as FloatArray + val sampleRate: Int = objArray[1] as Int + + val stream = recognizer.createStream() + stream.acceptWaveform(samples, sampleRate=sampleRate) + while 
(recognizer.isReady(stream)) { + recognizer.decode(stream) + } + + val result = recognizer.getResult(stream).text + println(result) + + stream.release() + recognizer.release() +} + +fun createOnlineRecognizer(): OnlineRecognizer { + val config = OnlineRecognizerConfig( + featConfig = getFeatureConfig(sampleRate = 16000, featureDim = 80), + modelConfig = getModelConfig(8)!!, + ) + + config.ruleFsts = "./itn_zh_number.fst" + println(config) + + return OnlineRecognizer(config = config) +} + diff --git a/nodejs-addon-examples/package.json b/nodejs-addon-examples/package.json index 28d846a11..9c277ef90 100644 --- a/nodejs-addon-examples/package.json +++ b/nodejs-addon-examples/package.json @@ -1,5 +1,5 @@ { "dependencies": { - "sherpa-onnx-node": "^1.9.30" + "sherpa-onnx-node": "^1.10.0" } } diff --git a/nodejs-addon-examples/test_asr_streaming_transducer_itn.js b/nodejs-addon-examples/test_asr_streaming_transducer_itn.js new file mode 100644 index 000000000..b8dfb6cb4 --- /dev/null +++ b/nodejs-addon-examples/test_asr_streaming_transducer_itn.js @@ -0,0 +1,59 @@ +// Copyright (c) 2024 Xiaomi Corporation +const sherpa_onnx = require('sherpa-onnx-node'); + +// Please download test files from +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +const config = { + 'featConfig': { + 'sampleRate': 16000, + 'featureDim': 80, + }, + 'modelConfig': { + 'transducer': { + 'encoder': + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx', + 'decoder': + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx', + 'joiner': + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx', + }, + 'tokens': + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt', + 'numThreads': 2, + 'provider': 'cpu', + 'debug': 1, + }, + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + ruleFsts: './itn_zh_number.fst', +}; + +// 
https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +const waveFilename = './itn-zh-number.wav'; + +const recognizer = new sherpa_onnx.OnlineRecognizer(config); +console.log('Started') +let start = Date.now(); +const stream = recognizer.createStream(); +const wave = sherpa_onnx.readWave(waveFilename); +stream.acceptWaveform({sampleRate: wave.sampleRate, samples: wave.samples}); + +const tailPadding = new Float32Array(wave.sampleRate * 0.4); +stream.acceptWaveform({samples: tailPadding, sampleRate: wave.sampleRate}); + +while (recognizer.isReady(stream)) { + recognizer.decode(stream); +} +result = recognizer.getResult(stream) +let stop = Date.now(); +console.log('Done') + +const elapsed_seconds = (stop - start) / 1000; +const duration = wave.samples.length / wave.sampleRate; +const real_time_factor = elapsed_seconds / duration; +console.log('Wave duration', duration.toFixed(3), 'secodns') +console.log('Elapsed', elapsed_seconds.toFixed(3), 'secodns') +console.log( + `RTF = ${elapsed_seconds.toFixed(3)}/${duration.toFixed(3)} =`, + real_time_factor.toFixed(3)) +console.log(waveFilename) +console.log('result\n', result) diff --git a/nodejs-addon-examples/test_asr_streaming_transducer_microphone_itn.js b/nodejs-addon-examples/test_asr_streaming_transducer_microphone_itn.js new file mode 100644 index 000000000..34807d10c --- /dev/null +++ b/nodejs-addon-examples/test_asr_streaming_transducer_microphone_itn.js @@ -0,0 +1,88 @@ +// Copyright (c) 2023-2024 Xiaomi Corporation (authors: Fangjun Kuang) +// +const portAudio = require('naudiodon2'); +// console.log(portAudio.getDevices()); + +const sherpa_onnx = require('sherpa-onnx-node'); + +function createOnlineRecognizer() { + const config = { + 'featConfig': { + 'sampleRate': 16000, + 'featureDim': 80, + }, + 'modelConfig': { + 'transducer': { + 'encoder': + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.onnx', + 'decoder': + 
'./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx', + 'joiner': + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx', + }, + 'tokens': + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt', + 'numThreads': 2, + 'provider': 'cpu', + 'debug': 1, + }, + 'decodingMethod': 'greedy_search', + 'maxActivePaths': 4, + 'enableEndpoint': true, + 'rule1MinTrailingSilence': 2.4, + 'rule2MinTrailingSilence': 1.2, + 'rule3MinUtteranceLength': 20, + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + ruleFsts: './itn_zh_number.fst', + }; + + return new sherpa_onnx.OnlineRecognizer(config); +} + +const recognizer = createOnlineRecognizer(); +const stream = recognizer.createStream(); + +let lastText = ''; +let segmentIndex = 0; + +const ai = new portAudio.AudioIO({ + inOptions: { + channelCount: 1, + closeOnError: true, // Close the stream if an audio error is detected, if + // set false then just log the error + deviceId: -1, // Use -1 or omit the deviceId to select the default device + sampleFormat: portAudio.SampleFormatFloat32, + sampleRate: recognizer.config.featConfig.sampleRate + } +}); + +const display = new sherpa_onnx.Display(50); + +ai.on('data', data => { + const samples = new Float32Array(data.buffer); + + stream.acceptWaveform( + {sampleRate: recognizer.config.featConfig.sampleRate, samples: samples}); + + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + + const isEndpoint = recognizer.isEndpoint(stream); + const text = recognizer.getResult(stream).text.toLowerCase(); + + if (text.length > 0 && lastText != text) { + lastText = text; + display.print(segmentIndex, lastText); + } + if (isEndpoint) { + if (text.length > 0) { + lastText = text; + segmentIndex += 1; + } + recognizer.reset(stream) + } +}); + +ai.start(); +console.log('Started! 
Please speak') diff --git a/nodejs-examples/test-online-transducer-itn.js b/nodejs-examples/test-online-transducer-itn.js new file mode 100644 index 000000000..9bc5360a2 --- /dev/null +++ b/nodejs-examples/test-online-transducer-itn.js @@ -0,0 +1,131 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createOnlineRecognizer() { + let onlineTransducerModelConfig = { + encoder: + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx', + decoder: + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx', + joiner: + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx', + }; + + let onlineParaformerModelConfig = { + encoder: '', + decoder: '', + }; + + let onlineZipformer2CtcModelConfig = { + model: '', + }; + + let onlineModelConfig = { + transducer: onlineTransducerModelConfig, + paraformer: onlineParaformerModelConfig, + zipformer2Ctc: onlineZipformer2CtcModelConfig, + tokens: + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt', + numThreads: 1, + provider: 'cpu', + debug: 1, + modelType: 'zipformer', + }; + + let featureConfig = { + sampleRate: 16000, + featureDim: 80, + }; + + let recognizerConfig = { + featConfig: featureConfig, + modelConfig: onlineModelConfig, + decodingMethod: 'greedy_search', + maxActivePaths: 4, + enableEndpoint: 1, + rule1MinTrailingSilence: 2.4, + rule2MinTrailingSilence: 1.2, + rule3MinUtteranceLength: 20, + hotwordsFile: '', + hotwordsScore: 1.5, + ctcFstDecoderConfig: { + graph: '', + maxActive: 3000, + }, + // https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + ruleFsts: './itn_zh_number.fst', + }; + + return sherpa_onnx.createOnlineRecognizer(recognizerConfig); +} + +const recognizer = 
createOnlineRecognizer(); +const stream = recognizer.createStream(); + +// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn-zh-number.wav +const waveFilename = './itn-zh-number.wav'; + +const reader = new wav.Reader(); +const readable = new Readable().wrap(reader); + +function decode(samples) { + stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples); + + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + const text = recognizer.getResult(stream).text; + console.log(text); +} + +reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { + if (sampleRate != recognizer.config.featConfig.sampleRate) { + throw new Error(`Only support sampleRate ${ + recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); + } + + if (audioFormat != 1) { + throw new Error(`Only support PCM format. Given ${audioFormat}`); + } + + if (channels != 1) { + throw new Error(`Only a single channel. Given ${channel}`); + } + + if (bitDepth != 16) { + throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); + } +}); + +fs.createReadStream(waveFilename, {'highWaterMark': 4096}) + .pipe(reader) + .on('finish', function(err) { + // tail padding + const floatSamples = + new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); + decode(floatSamples); + stream.free(); + recognizer.free(); + }); + +readable.on('readable', function() { + let chunk; + while ((chunk = readable.read()) != null) { + const int16Samples = new Int16Array( + chunk.buffer, chunk.byteOffset, + chunk.length / Int16Array.BYTES_PER_ELEMENT); + + const floatSamples = new Float32Array(int16Samples.length); + + for (let i = 0; i < floatSamples.length; i++) { + floatSamples[i] = int16Samples[i] / 32768.0; + } + + decode(floatSamples); + } +}); diff --git a/scripts/apk/build-apk-asr-2pass.sh.in b/scripts/apk/build-apk-asr-2pass.sh.in index 24c2cd3ea..4cd5761a8 100644 --- a/scripts/apk/build-apk-asr-2pass.sh.in +++ b/scripts/apk/build-apk-asr-2pass.sh.in @@ -71,6 +71,17 @@ git checkout . pushd android/SherpaOnnx2Pass/app/src/main/java/com/k2fsa/sherpa/onnx sed -i.bak s/"firstType = 9/firstType = $type1/" ./MainActivity.kt sed -i.bak s/"secondType = 0/secondType = $type2/" ./MainActivity.kt + +{% if first.rule_fsts %} + rule_fsts={{ first.rule_fsts }} + sed -i.bak s%"firstRuleFsts = null"%"firstRuleFsts = \"$rule_fsts\""% ./MainActivity.kt +{% endif %} + +{% if second.rule_fsts %} + rule_fsts={{ second.rule_fsts }} + sed -i.bak s%"secondRuleFsts = null"%"secondRuleFsts = \"$rule_fsts\""% ./MainActivity.kt +{% endif %} + git diff popd diff --git a/scripts/apk/build-apk-asr.sh.in b/scripts/apk/build-apk-asr.sh.in index 468959f08..d2169203a 100644 --- a/scripts/apk/build-apk-asr.sh.in +++ b/scripts/apk/build-apk-asr.sh.in @@ -54,6 +54,12 @@ popd git checkout . 
pushd android/SherpaOnnx/app/src/main/java/com/k2fsa/sherpa/onnx sed -i.bak s/"type = 0/type = $type/" ./MainActivity.kt + +{% if model.rule_fsts %} + rule_fsts={{ model.rule_fsts }} + sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./MainActivity.kt +{% endif %} + git diff popd @@ -84,6 +90,7 @@ for arch in arm64-v8a armeabi-v7a x86_64 x86; do done rm -rf ./android/SherpaOnnx/app/src/main/assets/$model_name +rm -rf ./android/SherpaOnnx/app/src/main/assets/*.fst {% endfor %} git checkout . diff --git a/scripts/apk/build-apk-vad-asr.sh.in b/scripts/apk/build-apk-vad-asr.sh.in index eb79eab5e..b1c159ece 100644 --- a/scripts/apk/build-apk-vad-asr.sh.in +++ b/scripts/apk/build-apk-vad-asr.sh.in @@ -56,6 +56,12 @@ popd git checkout . pushd android/SherpaOnnxVadAsr/app/src/main/java/com/k2fsa/sherpa/onnx sed -i.bak s/"asrModelType = 0/asrModelType = $type/" ./MainActivity.kt + +{% if model.rule_fsts %} + rule_fsts={{ model.rule_fsts }} + sed -i.bak s%"asrRuleFsts = null"%"asrRuleFsts = \"$rule_fsts\""% ./MainActivity.kt +{% endif %} + git diff popd diff --git a/scripts/apk/generate-asr-2pass-apk-script.py b/scripts/apk/generate-asr-2pass-apk-script.py index 85da8ada2..fc86b5193 100755 --- a/scripts/apk/generate-asr-2pass-apk-script.py +++ b/scripts/apk/generate-asr-2pass-apk-script.py @@ -41,6 +41,7 @@ class Model: # cmd is used to remove extra file from the model directory cmd: str = "" + rule_fsts: str = "" def get_2nd_models(): @@ -70,7 +71,11 @@ def get_2nd_models(): idx=0, lang="zh", short_name="paraformer", + rule_fsts="itn_zh_number.fst", cmd=""" + if [ ! -f itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi pushd $model_name rm -fv README.md @@ -87,7 +92,11 @@ def get_2nd_models(): idx=4, lang="zh", short_name="zipformer", + rule_fsts="itn_zh_number.fst", cmd=""" + if [ ! 
-f itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi pushd $model_name rm -rfv test_wavs @@ -117,7 +126,11 @@ def get_1st_models(): idx=8, lang="bilingual_zh_en", short_name="zipformer", + rule_fsts="itn_zh_number.fst", cmd=""" + if [ ! -f itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi pushd $model_name rm -fv decoder-epoch-99-avg-1.int8.onnx rm -fv encoder-epoch-99-avg-1.onnx @@ -160,7 +173,11 @@ def get_1st_models(): idx=3, lang="zh", short_name="zipformer2", + rule_fsts="itn_zh_number.fst", cmd=""" + if [ ! -f itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi pushd $model_name rm -fv exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx rm -fv exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx @@ -202,7 +219,11 @@ def get_1st_models(): idx=9, lang="zh", short_name="small_zipformer", + rule_fsts="itn_zh_number.fst", cmd=""" + if [ ! -f itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi pushd $model_name rm -fv encoder-epoch-99-avg-1.onnx rm -fv decoder-epoch-99-avg-1.int8.onnx diff --git a/scripts/apk/generate-asr-apk-script.py b/scripts/apk/generate-asr-apk-script.py index 8684877cf..05a22a921 100755 --- a/scripts/apk/generate-asr-apk-script.py +++ b/scripts/apk/generate-asr-apk-script.py @@ -42,6 +42,8 @@ class Model: # cmd is used to remove extra file from the model directory cmd: str = "" + rule_fsts: str = "" + def get_models(): models = [ @@ -50,7 +52,11 @@ def get_models(): idx=8, lang="bilingual_zh_en", short_name="zipformer", + rule_fsts="itn_zh_number.fst", cmd=""" + if [ ! 
-f itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi pushd $model_name rm -fv decoder-epoch-99-avg-1.int8.onnx rm -fv encoder-epoch-99-avg-1.onnx @@ -93,7 +99,11 @@ def get_models(): idx=3, lang="zh", short_name="zipformer2", + rule_fsts="itn_zh_number.fst", cmd=""" + if [ ! -f itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi pushd $model_name rm -fv exp/encoder-epoch-12-avg-4-chunk-16-left-128.onnx rm -fv exp/decoder-epoch-12-avg-4-chunk-16-left-128.int8.onnx @@ -135,7 +145,11 @@ def get_models(): idx=9, lang="zh", short_name="small_zipformer", + rule_fsts="itn_zh_number.fst", cmd=""" + if [ ! -f itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi pushd $model_name rm -fv encoder-epoch-99-avg-1.onnx rm -fv decoder-epoch-99-avg-1.int8.onnx diff --git a/scripts/apk/generate-vad-asr-apk-script.py b/scripts/apk/generate-vad-asr-apk-script.py index ca38fa3fb..61188ca7f 100755 --- a/scripts/apk/generate-vad-asr-apk-script.py +++ b/scripts/apk/generate-vad-asr-apk-script.py @@ -42,6 +42,8 @@ class Model: # cmd is used to remove extra file from the model directory cmd: str = "" + rule_fsts: str = "" + # See get_2nd_models() in ./generate-asr-2pass-apk-script.py def get_models(): @@ -71,7 +73,11 @@ def get_models(): idx=0, lang="zh", short_name="paraformer", + rule_fsts="itn_zh_number.fst", cmd=""" + if [ ! -f itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi pushd $model_name rm -v README.md @@ -88,7 +94,11 @@ def get_models(): idx=4, lang="zh", short_name="zipformer", + rule_fsts="itn_zh_number.fst", cmd=""" + if [ ! 
-f itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi pushd $model_name rm -rfv test_wavs @@ -171,7 +181,11 @@ def get_models(): idx=11, lang="zh", short_name="telespeech", + rule_fsts="itn_zh_number.fst", cmd=""" + if [ ! -f itn_zh_number.fst ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/itn_zh_number.fst + fi pushd $model_name rm -rfv test_wavs diff --git a/scripts/dotnet/OnlineRecognizerConfig.cs b/scripts/dotnet/OnlineRecognizerConfig.cs index 6ba6f5b64..dc58fe844 100644 --- a/scripts/dotnet/OnlineRecognizerConfig.cs +++ b/scripts/dotnet/OnlineRecognizerConfig.cs @@ -26,6 +26,8 @@ public OnlineRecognizerConfig() HotwordsFile = ""; HotwordsScore = 1.5F; CtcFstDecoderConfig = new OnlineCtcFstDecoderConfig(); + RuleFsts = ""; + RuleFars = ""; } public FeatureConfig FeatConfig; public OnlineModelConfig ModelConfig; @@ -64,5 +66,11 @@ public OnlineRecognizerConfig() public float HotwordsScore; public OnlineCtcFstDecoderConfig CtcFstDecoderConfig; + + [MarshalAs(UnmanagedType.LPStr)] + public string RuleFsts; + + [MarshalAs(UnmanagedType.LPStr)] + public string RuleFars; } } diff --git a/scripts/go/_internal/streaming-decode-files/run-transducer-itn.sh b/scripts/go/_internal/streaming-decode-files/run-transducer-itn.sh new file mode 120000 index 000000000..0e1f525a5 --- /dev/null +++ b/scripts/go/_internal/streaming-decode-files/run-transducer-itn.sh @@ -0,0 +1 @@ +../../../../go-api-examples/streaming-decode-files/run-transducer-itn.sh \ No newline at end of file diff --git a/scripts/go/release.sh b/scripts/go/release.sh index 77d663f1b..d46eb1cf7 100755 --- a/scripts/go/release.sh +++ b/scripts/go/release.sh @@ -79,8 +79,8 @@ function osx() { mkdir t cd t - wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_x86_64.whl - unzip 
./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_x86_64.whl + wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_11_0_x86_64.whl + unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_11_0_x86_64.whl cp -v sherpa_onnx/lib/*.dylib $dst/ @@ -93,8 +93,8 @@ function osx() { mkdir t cd t - wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_arm64.whl - unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_arm64.whl + wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_11_0_arm64.whl + unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp39-cp39-macosx_11_0_arm64.whl cp -v sherpa_onnx/lib/*.dylib $dst/ @@ -126,7 +126,6 @@ function windows() { unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.dll $dst - cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.lib $dst cd .. rm -rf t @@ -139,7 +138,6 @@ function windows() { unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win32.whl cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.dll $dst - cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.lib $dst cd .. rm -rf t diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index 437f4f328..898c0c21c 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -127,7 +127,11 @@ type OnlineRecognizerConfig struct { Rule1MinTrailingSilence float32 Rule2MinTrailingSilence float32 Rule3MinUtteranceLength float32 + HotwordsFile string + HotwordsScore float32 CtcFstDecoderConfig OnlineCtcFstDecoderConfig + RuleFsts string + RuleFars string } // It contains the recognition result for a online stream. 
@@ -204,6 +208,17 @@ func NewOnlineRecognizer(config *OnlineRecognizerConfig) *OnlineRecognizer { c.rule2_min_trailing_silence = C.float(config.Rule2MinTrailingSilence) c.rule3_min_utterance_length = C.float(config.Rule3MinUtteranceLength) + c.hotwords_file = C.CString(config.HotwordsFile) + defer C.free(unsafe.Pointer(c.hotwords_file)) + + c.hotwords_score = C.float(config.HotwordsScore) + + c.rule_fsts = C.CString(config.RuleFsts) + defer C.free(unsafe.Pointer(c.rule_fsts)) + + c.rule_fars = C.CString(config.RuleFars) + defer C.free(unsafe.Pointer(c.rule_fars)) + c.ctc_fst_decoder_config.graph = C.CString(config.CtcFstDecoderConfig.Graph) defer C.free(unsafe.Pointer(c.ctc_fst_decoder_config.graph)) c.ctc_fst_decoder_config.max_active = C.int(config.CtcFstDecoderConfig.MaxActive) diff --git a/scripts/node-addon-api/src/streaming-asr.cc b/scripts/node-addon-api/src/streaming-asr.cc index 59312a230..81482c824 100644 --- a/scripts/node-addon-api/src/streaming-asr.cc +++ b/scripts/node-addon-api/src/streaming-asr.cc @@ -189,6 +189,8 @@ static Napi::External CreateOnlineRecognizerWrapper( rule3MinUtteranceLength); SHERPA_ONNX_ASSIGN_ATTR_STR(hotwords_file, hotwordsFile); SHERPA_ONNX_ASSIGN_ATTR_FLOAT(hotwords_score, hotwordsScore); + SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fsts, ruleFsts); + SHERPA_ONNX_ASSIGN_ATTR_STR(rule_fars, ruleFars); c.ctc_fst_decoder_config = GetCtcFstDecoderConfig(o); @@ -246,6 +248,14 @@ static Napi::External CreateOnlineRecognizerWrapper( delete[] c.hotwords_file; } + if (c.rule_fsts) { + delete[] c.rule_fsts; + } + + if (c.rule_fars) { + delete[] c.rule_fars; + } + if (c.ctc_fst_decoder_config.graph) { delete[] c.ctc_fst_decoder_config.graph; } diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 01e2191c3..2d0118833 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -110,6 +110,9 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer( recognizer_config.ctc_fst_decoder_config.max_active = 
SHERPA_ONNX_OR(config->ctc_fst_decoder_config.max_active, 3000); + recognizer_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); + recognizer_config.rule_fars = SHERPA_ONNX_OR(config->rule_fars, ""); + if (config->model_config.debug) { SHERPA_ONNX_LOGE("%s\n", recognizer_config.ToString().c_str()); } diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 0229f8059..e9637ae7c 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -144,6 +144,8 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerConfig { float hotwords_score; SherpaOnnxOnlineCtcFstDecoderConfig ctc_fst_decoder_config; + const char *rule_fsts; + const char *rule_fars; } SherpaOnnxOnlineRecognizerConfig; SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerResult { diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index 6edb82402..bac0499a8 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -190,9 +190,10 @@ if(NOT BUILD_SHARED_LIBS AND APPLE) target_link_libraries(sherpa-onnx-core "-framework Foundation") endif() +target_link_libraries(sherpa-onnx-core fstfar fst) + if(SHERPA_ONNX_ENABLE_TTS) target_link_libraries(sherpa-onnx-core piper_phonemize) - target_link_libraries(sherpa-onnx-core fstfar fst) target_link_libraries(sherpa-onnx-core cppjieba) endif() diff --git a/sherpa-onnx/csrc/offline-recognizer-impl.cc b/sherpa-onnx/csrc/offline-recognizer-impl.cc index 546d0f9bf..80a6766ce 100644 --- a/sherpa-onnx/csrc/offline-recognizer-impl.cc +++ b/sherpa-onnx/csrc/offline-recognizer-impl.cc @@ -425,9 +425,6 @@ std::string OfflineRecognizerImpl::ApplyInverseTextNormalization( if (!itn_list_.empty()) { for (const auto &tn : itn_list_) { text = tn->Normalize(text); - if (config_.model_config.debug) { - SHERPA_ONNX_LOGE("After inverse text normalization: %s", text.c_str()); - } } } diff --git a/sherpa-onnx/csrc/online-recognizer-impl.cc b/sherpa-onnx/csrc/online-recognizer-impl.cc index 
89d172f97..2784ad24c 100644 --- a/sherpa-onnx/csrc/online-recognizer-impl.cc +++ b/sherpa-onnx/csrc/online-recognizer-impl.cc @@ -4,6 +4,8 @@ #include "sherpa-onnx/csrc/online-recognizer-impl.h" +#include + #if __ANDROID_API__ >= 9 #include @@ -186,9 +188,6 @@ std::string OnlineRecognizerImpl::ApplyInverseTextNormalization( if (!itn_list_.empty()) { for (const auto &tn : itn_list_) { text = tn->Normalize(text); - if (config_.model_config.debug) { - SHERPA_ONNX_LOGE("After inverse text normalization: %s", text.c_str()); - } } } diff --git a/sherpa-onnx/flutter/CHANGELOG.md b/sherpa-onnx/flutter/CHANGELOG.md index 5f912155b..f7524ea9d 100644 --- a/sherpa-onnx/flutter/CHANGELOG.md +++ b/sherpa-onnx/flutter/CHANGELOG.md @@ -1,3 +1,7 @@ +## 1.10.0 + +* Add inverse text normalization + ## 1.9.30 * Add TTS diff --git a/sherpa-onnx/flutter/lib/src/online_recognizer.dart b/sherpa-onnx/flutter/lib/src/online_recognizer.dart index bee1f2683..49ca3d2e8 100644 --- a/sherpa-onnx/flutter/lib/src/online_recognizer.dart +++ b/sherpa-onnx/flutter/lib/src/online_recognizer.dart @@ -111,11 +111,13 @@ class OnlineRecognizerConfig { this.hotwordsFile = '', this.hotwordsScore = 1.5, this.ctcFstDecoderConfig = const OnlineCtcFstDecoderConfig(), + this.ruleFsts = '', + this.ruleFars = '', }); @override String toString() { - return 'OnlineRecognizerConfig(feat: $feat, model: $model, decodingMethod: $decodingMethod, maxActivePaths: $maxActivePaths, enableEndpoint: $enableEndpoint, rule1MinTrailingSilence: $rule1MinTrailingSilence, rule2MinTrailingSilence: $rule2MinTrailingSilence, rule3MinUtteranceLength: $rule3MinUtteranceLength, hotwordsFile: $hotwordsFile, hotwordsScore: $hotwordsScore, ctcFstDecoderConfig: $ctcFstDecoderConfig)'; + return 'OnlineRecognizerConfig(feat: $feat, model: $model, decodingMethod: $decodingMethod, maxActivePaths: $maxActivePaths, enableEndpoint: $enableEndpoint, rule1MinTrailingSilence: $rule1MinTrailingSilence, rule2MinTrailingSilence: $rule2MinTrailingSilence, 
rule3MinUtteranceLength: $rule3MinUtteranceLength, hotwordsFile: $hotwordsFile, hotwordsScore: $hotwordsScore, ctcFstDecoderConfig: $ctcFstDecoderConfig, ruleFsts: $ruleFsts, ruleFars: $ruleFars)'; } final FeatureConfig feat; @@ -137,6 +139,8 @@ class OnlineRecognizerConfig { final double hotwordsScore; final OnlineCtcFstDecoderConfig ctcFstDecoderConfig; + final String ruleFsts; + final String ruleFars; } class OnlineRecognizerResult { @@ -201,9 +205,13 @@ class OnlineRecognizer { c.ref.ctcFstDecoderConfig.graph = config.ctcFstDecoderConfig.graph.toNativeUtf8(); c.ref.ctcFstDecoderConfig.maxActive = config.ctcFstDecoderConfig.maxActive; + c.ref.ruleFsts = config.ruleFsts.toNativeUtf8(); + c.ref.ruleFars = config.ruleFars.toNativeUtf8(); final ptr = SherpaOnnxBindings.createOnlineRecognizer?.call(c) ?? nullptr; + calloc.free(c.ref.ruleFars); + calloc.free(c.ref.ruleFsts); calloc.free(c.ref.ctcFstDecoderConfig.graph); calloc.free(c.ref.hotwordsFile); calloc.free(c.ref.decodingMethod); diff --git a/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart b/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart index 70d9572e7..cd3e7781d 100644 --- a/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart +++ b/sherpa-onnx/flutter/lib/src/sherpa_onnx_bindings.dart @@ -205,6 +205,9 @@ final class SherpaOnnxOnlineRecognizerConfig extends Struct { external double hotwordsScore; external SherpaOnnxOnlineCtcFstDecoderConfig ctcFstDecoderConfig; + + external Pointer ruleFsts; + external Pointer ruleFars; } final class SherpaOnnxSileroVadModelConfig extends Struct { diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OnlineRecognizerConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OnlineRecognizerConfig.java index af4b76d4d..cb9afd2e1 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OnlineRecognizerConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OnlineRecognizerConfig.java @@ -15,6 +15,8 @@ public class OnlineRecognizerConfig { 
private final int maxActivePaths; private final String hotwordsFile; private final float hotwordsScore; + private final String ruleFsts; + private final String ruleFars; private OnlineRecognizerConfig(Builder builder) { this.featConfig = builder.featConfig; @@ -27,6 +29,8 @@ private OnlineRecognizerConfig(Builder builder) { this.maxActivePaths = builder.maxActivePaths; this.hotwordsFile = builder.hotwordsFile; this.hotwordsScore = builder.hotwordsScore; + this.ruleFsts = builder.ruleFsts; + this.ruleFars = builder.ruleFars; } public static Builder builder() { @@ -48,6 +52,8 @@ public static class Builder { private int maxActivePaths = 4; private String hotwordsFile = ""; private float hotwordsScore = 1.5f; + private String ruleFsts = ""; + private String ruleFars = ""; public OnlineRecognizerConfig build() { return new OnlineRecognizerConfig(this); @@ -102,5 +108,15 @@ public Builder setHotwordsScore(float hotwordsScore) { this.hotwordsScore = hotwordsScore; return this; } + + public Builder setRuleFsts(String ruleFsts) { + this.ruleFsts = ruleFsts; + return this; + } + + public Builder setRuleFars(String ruleFars) { + this.ruleFars = ruleFars; + return this; + } } } diff --git a/sherpa-onnx/jni/online-recognizer.cc b/sherpa-onnx/jni/online-recognizer.cc index e8044526e..d8acd0fed 100644 --- a/sherpa-onnx/jni/online-recognizer.cc +++ b/sherpa-onnx/jni/online-recognizer.cc @@ -37,6 +37,18 @@ static OnlineRecognizerConfig GetConfig(JNIEnv *env, jobject config) { fid = env->GetFieldID(cls, "hotwordsScore", "F"); ans.hotwords_score = env->GetFloatField(config, fid); + fid = env->GetFieldID(cls, "ruleFsts", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(config, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.rule_fsts = p; + env->ReleaseStringUTFChars(s, p); + + fid = env->GetFieldID(cls, "ruleFars", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(config, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.rule_fars = p; + 
env->ReleaseStringUTFChars(s, p); + //---------- feat config ---------- fid = env->GetFieldID(cls, "featConfig", "Lcom/k2fsa/sherpa/onnx/FeatureConfig;"); diff --git a/sherpa-onnx/kotlin-api/OnlineRecognizer.kt b/sherpa-onnx/kotlin-api/OnlineRecognizer.kt index 93a21e07e..de47a5ebd 100644 --- a/sherpa-onnx/kotlin-api/OnlineRecognizer.kt +++ b/sherpa-onnx/kotlin-api/OnlineRecognizer.kt @@ -69,6 +69,8 @@ data class OnlineRecognizerConfig( var maxActivePaths: Int = 4, var hotwordsFile: String = "", var hotwordsScore: Float = 1.5f, + var ruleFsts: String = "", + var ruleFars: String = "", ) data class OnlineRecognizerResult( diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index 24082a827..432abcb61 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -135,7 +135,9 @@ func sherpaOnnxOnlineRecognizerConfig( maxActivePaths: Int = 4, hotwordsFile: String = "", hotwordsScore: Float = 1.5, - ctcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig = sherpaOnnxOnlineCtcFstDecoderConfig() + ctcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig = sherpaOnnxOnlineCtcFstDecoderConfig(), + ruleFsts: String = "", + ruleFars: String = "" ) -> SherpaOnnxOnlineRecognizerConfig { return SherpaOnnxOnlineRecognizerConfig( feat_config: featConfig, @@ -148,7 +150,9 @@ func sherpaOnnxOnlineRecognizerConfig( rule3_min_utterance_length: rule3MinUtteranceLength, hotwords_file: toCPointer(hotwordsFile), hotwords_score: hotwordsScore, - ctc_fst_decoder_config: ctcFstDecoderConfig + ctc_fst_decoder_config: ctcFstDecoderConfig, + rule_fsts: toCPointer(ruleFsts), + rule_fars: toCPointer(ruleFars) ) } diff --git a/wasm/asr/CMakeLists.txt b/wasm/asr/CMakeLists.txt index b46fe39a1..2a6dd13f6 100644 --- a/wasm/asr/CMakeLists.txt +++ b/wasm/asr/CMakeLists.txt @@ -40,6 +40,8 @@ string(APPEND MY_FLAGS " -sEXPORTED_FUNCTIONS=[_CopyHeap,_malloc,_free,${all_exp string(APPEND MY_FLAGS "--preload-file 
${CMAKE_CURRENT_SOURCE_DIR}/assets@. ") string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue','lengthBytesUTF8','UTF8ToString'] ") +message(STATUS "MY_FLAGS: ${MY_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MY_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MY_FLAGS}") set(CMAKE_EXECUTBLE_LINKER_FLAGS "${CMAKE_EXECUTBLE_LINKER_FLAGS} ${MY_FLAGS}") diff --git a/wasm/asr/sherpa-onnx-asr.js b/wasm/asr/sherpa-onnx-asr.js index 2179fd87d..3341a093c 100644 --- a/wasm/asr/sherpa-onnx-asr.js +++ b/wasm/asr/sherpa-onnx-asr.js @@ -239,7 +239,7 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) { const ctcFstDecoder = initSherpaOnnxOnlineCtcFstDecoderConfig( config.ctcFstDecoderConfig, Module) - const len = feat.len + model.len + 8 * 4 + ctcFstDecoder.len; + const len = feat.len + model.len + 8 * 4 + ctcFstDecoder.len + 2 * 4; const ptr = Module._malloc(len); let offset = 0; @@ -251,7 +251,10 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) { const decodingMethodLen = Module.lengthBytesUTF8(config.decodingMethod) + 1; const hotwordsFileLen = Module.lengthBytesUTF8(config.hotwordsFile) + 1; - const bufferLen = decodingMethodLen + hotwordsFileLen; + const ruleFstsFileLen = Module.lengthBytesUTF8(config.ruleFsts || '') + 1; + const ruleFarsFileLen = Module.lengthBytesUTF8(config.ruleFars || '') + 1; + const bufferLen = + decodingMethodLen + hotwordsFileLen + ruleFstsFileLen + ruleFarsFileLen; const buffer = Module._malloc(bufferLen); offset = 0; @@ -259,6 +262,13 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) { offset += decodingMethodLen; Module.stringToUTF8(config.hotwordsFile, buffer + offset, hotwordsFileLen); + offset += hotwordsFileLen; + + Module.stringToUTF8(config.ruleFsts || '', buffer + offset, ruleFstsFileLen); + offset += ruleFstsFileLen; + + Module.stringToUTF8(config.ruleFars || '', buffer + offset, ruleFarsFileLen); + offset += ruleFarsFileLen; offset = feat.len + 
model.len; Module.setValue(ptr + offset, buffer, 'i8*'); // decoding method @@ -286,6 +296,16 @@ function initSherpaOnnxOnlineRecognizerConfig(config, Module) { offset += 4; Module._CopyHeap(ctcFstDecoder.ptr, ctcFstDecoder.len, ptr + offset); + offset += ctcFstDecoder.len; + + Module.setValue( + ptr + offset, buffer + decodingMethodLen + hotwordsFileLen, 'i8*'); + offset += 4; + + Module.setValue( + ptr + offset, + buffer + decodingMethodLen + hotwordsFileLen + ruleFstsFileLen, 'i8*'); + offset += 4; return { buffer: buffer, ptr: ptr, len: len, feat: feat, model: model, @@ -363,7 +383,9 @@ function createOnlineRecognizer(Module, myConfig) { ctcFstDecoderConfig: { graph: '', maxActive: 3000, - } + }, + ruleFsts: '', + ruleFars: '', }; if (myConfig) { recognizerConfig = myConfig; diff --git a/wasm/asr/sherpa-onnx-wasm-main-asr.cc b/wasm/asr/sherpa-onnx-wasm-main-asr.cc index de0cf1430..07e5736de 100644 --- a/wasm/asr/sherpa-onnx-wasm-main-asr.cc +++ b/wasm/asr/sherpa-onnx-wasm-main-asr.cc @@ -26,7 +26,7 @@ static_assert(sizeof(SherpaOnnxOnlineCtcFstDecoderConfig) == 2 * 4, ""); static_assert(sizeof(SherpaOnnxOnlineRecognizerConfig) == sizeof(SherpaOnnxFeatureConfig) + sizeof(SherpaOnnxOnlineModelConfig) + 8 * 4 + - sizeof(SherpaOnnxOnlineCtcFstDecoderConfig), + sizeof(SherpaOnnxOnlineCtcFstDecoderConfig) + 2 * 4, ""); void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) { @@ -71,6 +71,8 @@ void MyPrint(SherpaOnnxOnlineRecognizerConfig *config) { config->rule3_min_utterance_length); fprintf(stdout, "hotwords_file: %s\n", config->hotwords_file); fprintf(stdout, "hotwords_score: %.2f\n", config->hotwords_score); + fprintf(stdout, "rule_fsts: %s\n", config->rule_fsts); + fprintf(stdout, "rule_fars: %s\n", config->rule_fars); fprintf(stdout, "----------ctc fst decoder config----------\n"); fprintf(stdout, "graph: %s\n", config->ctc_fst_decoder_config.graph); diff --git a/wasm/kws/CMakeLists.txt b/wasm/kws/CMakeLists.txt index f083892cc..dfa6f7743 100644 --- 
a/wasm/kws/CMakeLists.txt +++ b/wasm/kws/CMakeLists.txt @@ -31,6 +31,7 @@ string(APPEND MY_FLAGS " -sSTACK_SIZE=10485760 ") string(APPEND MY_FLAGS " -sEXPORTED_FUNCTIONS=[_CopyHeap,_malloc,_free,${all_exported_functions}] ") string(APPEND MY_FLAGS "--preload-file ${CMAKE_CURRENT_SOURCE_DIR}/assets@. ") string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue','lengthBytesUTF8','UTF8ToString'] ") + message(STATUS "MY_FLAGS: ${MY_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MY_FLAGS}") @@ -51,4 +52,4 @@ install( "$/sherpa-onnx-wasm-kws-main.data" DESTINATION bin/wasm -) \ No newline at end of file +) diff --git a/wasm/tts/CMakeLists.txt b/wasm/tts/CMakeLists.txt index 15fe8dd34..618a98a1d 100644 --- a/wasm/tts/CMakeLists.txt +++ b/wasm/tts/CMakeLists.txt @@ -31,6 +31,8 @@ string(APPEND MY_FLAGS " -sEXPORTED_FUNCTIONS=[_CopyHeap,_malloc,_free,${all_exp string(APPEND MY_FLAGS "--preload-file ${CMAKE_CURRENT_SOURCE_DIR}/assets@. ") string(APPEND MY_FLAGS " -sEXPORTED_RUNTIME_METHODS=['ccall','stringToUTF8','setValue','getValue','lengthBytesUTF8','UTF8ToString'] ") +message(STATUS "MY_FLAGS: ${MY_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MY_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MY_FLAGS}") set(CMAKE_EXECUTBLE_LINKER_FLAGS "${CMAKE_EXECUTBLE_LINKER_FLAGS} ${MY_FLAGS}") From 0be7f2885383b651fdf3e003f93b0114aaae3910 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 18 Jun 2024 16:50:04 +0800 Subject: [PATCH 029/237] build pre-compiled libs for jni (linux x64) (#1026) --- .github/workflows/linux-jni.yaml | 161 +++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 .github/workflows/linux-jni.yaml diff --git a/.github/workflows/linux-jni.yaml b/.github/workflows/linux-jni.yaml new file mode 100644 index 000000000..12d33ed89 --- /dev/null +++ b/.github/workflows/linux-jni.yaml @@ -0,0 +1,161 @@ +name: linux-jni + +on: + push: + branches: + - linux-jni + tags: + - 
'v[0-9]+.[0-9]+.[0-9]+*' + workflow_dispatch: + +concurrency: + group: linux-jni-${{ github.ref }} + cancel-in-progress: true + +jobs: + linux-jni: + name: linux jni + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Display PWD + shell: bash + run: | + echo "pwd: $PWD" + ls -lh + du -h -d1 . + + - name: Build sherpa-onnx + uses: addnab/docker-run-action@v3 + with: + image: quay.io/pypa/manylinux2014_x86_64 + options: | + --volume ${{ github.workspace }}/:/home/runner/work/sherpa-onnx/sherpa-onnx + shell: bash + run: | + uname -a + gcc --version + cmake --version + cat /etc/*release + id + pwd + + yum install -y java-11-openjdk-devel + java -version + which java + ls -lh $(which java) + ls -lrt /etc/alternatives/java + + export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-11.0.23.0.9-2.el7_9.x86_64 + echo "JAVA_HOME: $JAVA_HOME" + find $JAVA_HOME -name jni.h + + cd /home/runner/work/sherpa-onnx/sherpa-onnx + + git clone --depth 1 https://github.com/alsa-project/alsa-lib + pushd alsa-lib + ./gitcompile + popd + + export CPLUS_INCLUDE_PATH=$PWD/alsa-lib/include:$CPLUS_INCLUDE_PATH + export SHERPA_ONNX_ALSA_LIB_DIR=$PWD/alsa-lib/src/.libs + + mkdir build + cd build + + cmake -DSHERPA_ONNX_ENABLE_TTS=ON -D CMAKE_BUILD_TYPE=${{ matrix.build_type }} -D BUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=./install -DSHERPA_ONNX_ENABLE_JNI=ON .. + + make -j2 + make install + + ls -lh lib + ls -lh bin + + - name: Display dependencies of sherpa-onnx for linux + shell: bash + run: | + du -h -d1 . 
+ sudo chown -R $USER ./build + ls -lh build/bin + ls -lh build/_deps/onnxruntime-src/lib/ + + echo "strip" + strip build/bin/* + echo "after strip" + ls -lh build/bin + + file build/bin/sherpa-onnx + file build/bin/sherpa-onnx + ls -lh build/bin/sherpa-onnx + readelf -d build/bin/sherpa-onnx + + - uses: actions/upload-artifact@v4 + with: + name: release-jni + path: install/* + + + - name: Copy files + shell: bash + run: | + du -h -d1 . + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + dst=sherpa-onnx-${SHERPA_ONNX_VERSION}-linux-x64-jni + mkdir $dst + + cp -a build/install/bin $dst/ + cp -a build/install/lib $dst/ + cp -a build/install/include $dst/ + + tree $dst + + tar cjvf ${dst}.tar.bz2 $dst + du -h -d1 . + + - name: Publish to huggingface + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event_name == 'push' || github.event_name == 'workflow_dispatch') + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_CLONE_PROTECTION_ACTIVE=false + GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface + + cd huggingface + mkdir -p jni + + cp -v ../sherpa-onnx-*.tar.bz2 ./jni + + git status + git lfs track "*.bz2" + + git add . 
+ + git commit -m "add more files" + + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-libs main + + - name: Release pre-compiled binaries and libs for linux x64 + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') && matrix.build_type == 'Release' + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + overwrite: true + file: sherpa-onnx-*.tar.bz2 From ab21131f7fceb8184357e7ad059905cc246eef07 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 18 Jun 2024 16:51:30 +0800 Subject: [PATCH 030/237] Swift API for keyword spotting. (#1027) --- .github/scripts/test-swift.sh | 4 + swift-api-examples/.gitignore | 1 + swift-api-examples/SherpaOnnx.swift | 108 ++++++++++++++++++ .../keyword-spotting-from-file.swift | 83 ++++++++++++++ .../run-keyword-spotting-from-file.sh | 34 ++++++ 5 files changed, 230 insertions(+) create mode 100644 swift-api-examples/keyword-spotting-from-file.swift create mode 100755 swift-api-examples/run-keyword-spotting-from-file.sh diff --git a/.github/scripts/test-swift.sh b/.github/scripts/test-swift.sh index 536c04c47..875c4fa34 100755 --- a/.github/scripts/test-swift.sh +++ b/.github/scripts/test-swift.sh @@ -7,6 +7,10 @@ echo "pwd: $PWD" cd swift-api-examples ls -lh +./run-keyword-spotting-from-file.sh +rm ./keyword-spotting-from-file +rm -rf sherpa-onnx-kws-* + ./run-streaming-hlg-decode-file.sh rm ./streaming-hlg-decode-file rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18 diff --git a/swift-api-examples/.gitignore b/swift-api-examples/.gitignore index f4290242b..794cabec8 100644 --- a/swift-api-examples/.gitignore +++ b/swift-api-examples/.gitignore @@ -8,3 +8,4 @@ sherpa-onnx-paraformer-zh-2023-09-14 !*.sh *.bak streaming-hlg-decode-file +keyword-spotting-from-file diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index 432abcb61..f69405d0c 
100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -832,3 +832,111 @@ class SherpaOnnxSpokenLanguageIdentificationWrapper { return SherpaOnnxSpokenLanguageIdentificationResultWrapper(result: result) } } + +// keyword spotting + +class SherpaOnnxKeywordResultWrapper { + /// A pointer to the underlying counterpart in C + let result: UnsafePointer! + + var keyword: String { + return String(cString: result.pointee.keyword) + } + + var count: Int32 { + return result.pointee.count + } + + var tokens: [String] { + if let tokensPointer = result.pointee.tokens_arr { + var tokens: [String] = [] + for index in 0..!) { + self.result = result + } + + deinit { + if let result { + DestroyKeywordResult(result) + } + } +} + +func sherpaOnnxKeywordSpotterConfig( + featConfig: SherpaOnnxFeatureConfig, + modelConfig: SherpaOnnxOnlineModelConfig, + keywordsFile: String, + maxActivePaths: Int = 4, + numTrailingBlanks: Int = 1, + keywordsScore: Float = 1.0, + keywordsThreshold: Float = 0.25 +) -> SherpaOnnxKeywordSpotterConfig { + return SherpaOnnxKeywordSpotterConfig( + feat_config: featConfig, + model_config: modelConfig, + max_active_paths: Int32(maxActivePaths), + num_trailing_blanks: Int32(numTrailingBlanks), + keywords_score: keywordsScore, + keywords_threshold: keywordsThreshold, + keywords_file: toCPointer(keywordsFile) + ) +} + +class SherpaOnnxKeywordSpotterWrapper { + /// A pointer to the underlying counterpart in C + let spotter: OpaquePointer! + var stream: OpaquePointer! + + init( + config: UnsafePointer! 
+ ) { + spotter = CreateKeywordSpotter(config) + stream = CreateKeywordStream(spotter) + } + + deinit { + if let stream { + DestroyOnlineStream(stream) + } + + if let spotter { + DestroyKeywordSpotter(spotter) + } + } + + func acceptWaveform(samples: [Float], sampleRate: Int = 16000) { + AcceptWaveform(stream, Int32(sampleRate), samples, Int32(samples.count)) + } + + func isReady() -> Bool { + return IsKeywordStreamReady(spotter, stream) == 1 ? true : false + } + + func decode() { + DecodeKeywordStream(spotter, stream) + } + + func getResult() -> SherpaOnnxKeywordResultWrapper { + let result: UnsafePointer? = GetKeywordResult( + spotter, stream) + return SherpaOnnxKeywordResultWrapper(result: result) + } + + /// Signal that no more audio samples would be available. + /// After this call, you cannot call acceptWaveform() any more. + func inputFinished() { + InputFinished(stream) + } +} diff --git a/swift-api-examples/keyword-spotting-from-file.swift b/swift-api-examples/keyword-spotting-from-file.swift new file mode 100644 index 000000000..08487eb4a --- /dev/null +++ b/swift-api-examples/keyword-spotting-from-file.swift @@ -0,0 +1,83 @@ +import AVFoundation + +extension AudioBuffer { + func array() -> [Float] { + return Array(UnsafeBufferPointer(self)) + } +} + +extension AVAudioPCMBuffer { + func array() -> [Float] { + return self.audioBufferList.pointee.mBuffers.array() + } +} + +func run() { + let filePath = "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav" + let encoder = + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/encoder-epoch-12-avg-2-chunk-16-left-64.onnx" + let decoder = + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/decoder-epoch-12-avg-2-chunk-16-left-64.onnx" + let joiner = + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/joiner-epoch-12-avg-2-chunk-16-left-64.onnx" + let tokens = + "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/tokens.txt" + let keywordsFile = + 
"./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/test_keywords.txt" + let transducerConfig = sherpaOnnxOnlineTransducerModelConfig( + encoder: encoder, + decoder: decoder, + joiner: joiner + ) + + let modelConfig = sherpaOnnxOnlineModelConfig( + tokens: tokens, + transducer: transducerConfig + ) + + let featConfig = sherpaOnnxFeatureConfig( + sampleRate: 16000, + featureDim: 80 + ) + var config = sherpaOnnxKeywordSpotterConfig( + featConfig: featConfig, + modelConfig: modelConfig, + keywordsFile: keywordsFile + ) + + let spotter = SherpaOnnxKeywordSpotterWrapper(config: &config) + + let fileURL: NSURL = NSURL(fileURLWithPath: filePath) + let audioFile = try! AVAudioFile(forReading: fileURL as URL) + + let audioFormat = audioFile.processingFormat + assert(audioFormat.sampleRate == 16000) + assert(audioFormat.channelCount == 1) + assert(audioFormat.commonFormat == AVAudioCommonFormat.pcmFormatFloat32) + + let audioFrameCount = UInt32(audioFile.length) + let audioFileBuffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: audioFrameCount) + + try! audioFile.read(into: audioFileBuffer!) + let array: [Float]! = audioFileBuffer?.array() + spotter.acceptWaveform(samples: array) + + let tailPadding = [Float](repeating: 0.0, count: 3200) + spotter.acceptWaveform(samples: tailPadding) + + spotter.inputFinished() + while spotter.isReady() { + spotter.decode() + let keyword = spotter.getResult().keyword + if keyword != "" { + print("Detected: \(keyword)") + } + } +} + +@main +struct App { + static func main() { + run() + } +} diff --git a/swift-api-examples/run-keyword-spotting-from-file.sh b/swift-api-examples/run-keyword-spotting-from-file.sh new file mode 100755 index 000000000..7b1420aa2 --- /dev/null +++ b/swift-api-examples/run-keyword-spotting-from-file.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +set -ex + +if [ ! -d ../build-swift-macos ]; then + echo "Please run ../build-swift-macos.sh first!" + exit 1 +fi + +if [ ! 
-d ./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01 ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/kws-models/sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2 + tar xf sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2 + rm sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01.tar.bz2 +fi + +if [ ! -e ./keyword-spotting-from-file ]; then + # Note: We use -lc++ to link against libc++ instead of libstdc++ + swiftc \ + -lc++ \ + -I ../build-swift-macos/install/include \ + -import-objc-header ./SherpaOnnx-Bridging-Header.h \ + ./keyword-spotting-from-file.swift ./SherpaOnnx.swift \ + -L ../build-swift-macos/install/lib/ \ + -l sherpa-onnx \ + -l onnxruntime \ + -o keyword-spotting-from-file + + strip keyword-spotting-from-file +else + echo "./keyword-spotting-from-file exists - skip building" +fi + +export DYLD_LIBRARY_PATH=$PWD/../build-swift-macos/install/lib:$DYLD_LIBRARY_PATH +./keyword-spotting-from-file From 167bc76db0f7980dec4387302613ce53ffb0cece Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=84=9A=E8=80=85=E8=87=AA=E6=84=9A?= <33177184+xiaokuang95@users.noreply.github.com> Date: Tue, 18 Jun 2024 18:29:39 +0800 Subject: [PATCH 031/237] fix generate-subtitles.py bug (#1029) * fix generate-subtitles.py If the audio file is not muted for more than 1 second at the end, it will cause the last segment to be lost --- python-api-examples/generate-subtitles.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python-api-examples/generate-subtitles.py b/python-api-examples/generate-subtitles.py index 1f03d0d08..960614251 100755 --- a/python-api-examples/generate-subtitles.py +++ b/python-api-examples/generate-subtitles.py @@ -386,12 +386,17 @@ def main(): print("Started!") + is_silence=False # TODO(fangjun): Support multithreads while True: # *2 because int16_t has two bytes data = process.stdout.read(frames_per_read * 2) if not data: - break + if is_silence: + break + is_silence=True + # 
The converted audio file does not have a mute data of 1 second or more at the end, which will result in the loss of the last segment data + data = np.zeros(1*args.sample_rate,dtype=np.int16) samples = np.frombuffer(data, dtype=np.int16) samples = samples.astype(np.float32) / 32768 From 656b9fa1c89c678c89b9e8f6fc2404be1d7ba685 Mon Sep 17 00:00:00 2001 From: SilverSulfide <37401741+SilverSulfide@users.noreply.github.com> Date: Wed, 19 Jun 2024 11:29:37 +0300 Subject: [PATCH 032/237] Add Python API support for Offline LM rescoring (#1033) --- .../python/sherpa_onnx/offline_recognizer.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py index 2fade069a..f0e9a45f2 100644 --- a/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py +++ b/sherpa-onnx/python/sherpa_onnx/offline_recognizer.py @@ -6,6 +6,7 @@ from _sherpa_onnx import ( FeatureExtractorConfig, OfflineCtcFstDecoderConfig, + OfflineLMConfig, OfflineModelConfig, OfflineNemoEncDecCtcModelConfig, OfflineParaformerModelConfig, @@ -56,6 +57,8 @@ def from_transducer( model_type: str = "transducer", rule_fsts: str = "", rule_fars: str = "", + lm: str = "", + lm_scale: float = 0.1, ): """ Please refer to @@ -143,9 +146,21 @@ def from_transducer( f"--hotwords-file. Currently given: {decoding_method}" ) + if lm and decoding_method != "modified_beam_search": + raise ValueError( + "Please use --decoding-method=modified_beam_search when using " + f"--lm. 
Currently given: {decoding_method}" + ) + + lm_config = OfflineLMConfig( + model=lm, + scale=lm_scale, + ) + recognizer_config = OfflineRecognizerConfig( feat_config=feat_config, model_config=model_config, + lm_config=lm_config, decoding_method=decoding_method, max_active_paths=max_active_paths, hotwords_file=hotwords_file, From a11c8599710dbbd2484e93b452acf2bb7196354f Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 19 Jun 2024 20:51:57 +0800 Subject: [PATCH 033/237] Support clang-tidy (#1034) --- .clang-tidy | 73 ++++++++++++++++ .github/workflows/clang-tidy.yaml | 57 +++++++++++++ .github/workflows/flutter-macos.yaml | 1 + .github/workflows/flutter-windows-x64.yaml | 2 + CMakeLists.txt | 2 +- cmake/openfst.cmake | 16 ++-- sherpa-onnx/csrc/CMakeLists.txt | 14 +++ sherpa-onnx/csrc/audio-tagging-label-file.cc | 2 +- sherpa-onnx/csrc/base64-decode.cc | 8 +- sherpa-onnx/csrc/cat.cc | 6 +- sherpa-onnx/csrc/circular-buffer.cc | 6 +- sherpa-onnx/csrc/context-graph.cc | 9 +- sherpa-onnx/csrc/endpoint.cc | 11 ++- sherpa-onnx/csrc/endpoint.h | 2 +- sherpa-onnx/csrc/jieba-lexicon.cc | 1 + sherpa-onnx/csrc/keyword-spotter.cc | 3 +- sherpa-onnx/csrc/lexicon.cc | 4 +- sherpa-onnx/csrc/lexicon.h | 4 +- .../csrc/offline-ct-transformer-model.cc | 2 +- sherpa-onnx/csrc/offline-ctc-model.cc | 2 +- sherpa-onnx/csrc/offline-stream.cc | 8 +- ...transducer-modified-beam-search-decoder.cc | 2 +- .../csrc/offline-tts-character-frontend.cc | 4 +- sherpa-onnx/csrc/offline-tts.cc | 3 +- sherpa-onnx/csrc/offline-whisper-model.cc | 29 +++---- .../csrc/online-conformer-transducer-model.cc | 3 +- sherpa-onnx/csrc/online-ctc-fst-decoder.cc | 5 +- .../csrc/online-lstm-transducer-model.cc | 3 +- sherpa-onnx/csrc/online-nemo-ctc-model.cc | 20 ++--- sherpa-onnx/csrc/online-recognizer.cc | 3 +- sherpa-onnx/csrc/online-stream.cc | 5 +- sherpa-onnx/csrc/online-stream.h | 1 - sherpa-onnx/csrc/online-transducer-decoder.cc | 4 +- sherpa-onnx/csrc/online-transducer-decoder.h | 4 +- 
sherpa-onnx/csrc/online-transducer-model.cc | 2 +- .../csrc/online-transducer-nemo-model.cc | 21 +++-- sherpa-onnx/csrc/online-wenet-ctc-model.cc | 18 ++-- .../csrc/online-zipformer-transducer-model.cc | 3 +- .../csrc/online-zipformer2-ctc-model.cc | 6 +- .../online-zipformer2-transducer-model.cc | 5 +- sherpa-onnx/csrc/onnx-utils.cc | 5 +- sherpa-onnx/csrc/onnx-utils.h | 4 +- sherpa-onnx/csrc/packed-sequence.cc | 5 +- sherpa-onnx/csrc/pad-sequence.cc | 3 +- sherpa-onnx/csrc/parse-options.cc | 85 +++++++++---------- sherpa-onnx/csrc/piper-phonemize-lexicon.cc | 2 +- sherpa-onnx/csrc/resample.cc | 29 +++---- sherpa-onnx/csrc/resample.h | 10 +-- sherpa-onnx/csrc/session.cc | 63 +++++++------- sherpa-onnx/csrc/silero-vad-model.cc | 4 +- sherpa-onnx/csrc/slice.cc | 3 +- .../csrc/speaker-embedding-extractor-impl.cc | 2 +- sherpa-onnx/csrc/speaker-embedding-manager.cc | 5 +- .../spoken-language-identification-impl.cc | 2 +- sherpa-onnx/csrc/stack.cc | 4 +- sherpa-onnx/csrc/symbol-table.cc | 2 +- sherpa-onnx/csrc/text-utils.cc | 3 +- .../csrc/transducer-keyword-decoder.cc | 1 - sherpa-onnx/csrc/transpose.cc | 3 +- sherpa-onnx/csrc/unbind.cc | 3 +- sherpa-onnx/csrc/utils.cc | 2 +- sherpa-onnx/csrc/wave-reader.cc | 4 +- sherpa-onnx/csrc/wave-writer.cc | 2 +- 63 files changed, 382 insertions(+), 238 deletions(-) create mode 100644 .clang-tidy create mode 100644 .github/workflows/clang-tidy.yaml diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..9f2baf4c5 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,73 @@ +--- +# NOTE there must be no spaces before the '-', so put the comma last. +# The check bugprone-unchecked-optional-access is also turned off atm +# because it causes clang-tidy to hang randomly. The tracking issue +# can be found at https://github.com/llvm/llvm-project/issues/69369. 
+# +# Modified from +# https://github.com/pytorch/pytorch/blob/main/.clang-tidy +InheritParentConfig: true +Checks: ' +bugprone-*, +-bugprone-easily-swappable-parameters, +-bugprone-forward-declaration-namespace, +-bugprone-implicit-widening-of-multiplication-result, +-bugprone-macro-parentheses, +-bugprone-lambda-function-name, +-bugprone-narrowing-conversions, +-bugprone-reserved-identifier, +-bugprone-swapped-arguments, +-bugprone-unchecked-optional-access, +clang-diagnostic-missing-prototypes, +cppcoreguidelines-*, +-cppcoreguidelines-avoid-const-or-ref-data-members, +-cppcoreguidelines-avoid-do-while, +-cppcoreguidelines-avoid-magic-numbers, +-cppcoreguidelines-avoid-non-const-global-variables, +-cppcoreguidelines-interfaces-global-init, +-cppcoreguidelines-macro-usage, +-cppcoreguidelines-narrowing-conversions, +-cppcoreguidelines-owning-memory, +-cppcoreguidelines-pro-bounds-array-to-pointer-decay, +-cppcoreguidelines-pro-bounds-constant-array-index, +-cppcoreguidelines-pro-bounds-pointer-arithmetic, +-cppcoreguidelines-pro-type-const-cast, +-cppcoreguidelines-pro-type-cstyle-cast, +-cppcoreguidelines-pro-type-reinterpret-cast, +-cppcoreguidelines-pro-type-static-cast-downcast, +-cppcoreguidelines-pro-type-union-access, +-cppcoreguidelines-pro-type-vararg, +-cppcoreguidelines-special-member-functions, +-cppcoreguidelines-non-private-member-variables-in-classes, +-facebook-hte-RelativeInclude, +hicpp-exception-baseclass, +hicpp-avoid-goto, +misc-*, +-misc-const-correctness, +-misc-include-cleaner, +-misc-use-anonymous-namespace, +-misc-unused-parameters, +-misc-no-recursion, +-misc-non-private-member-variables-in-classes, +-misc-confusable-identifiers, +modernize-*, +-modernize-macro-to-enum, +-modernize-pass-by-value, +-modernize-return-braced-init-list, +-modernize-use-auto, +-modernize-use-default-member-init, +-modernize-use-using, +-modernize-use-trailing-return-type, +-modernize-use-nodiscard, +performance-*, +readability-container-size-empty, 
+readability-delete-null-pointer, +readability-duplicate-include +readability-misplaced-array-index, +readability-redundant-function-ptr-dereference, +readability-redundant-smartptr-get, +readability-simplify-subscript-expr, +readability-string-compare, +' +WarningsAsErrors: '*' +... diff --git a/.github/workflows/clang-tidy.yaml b/.github/workflows/clang-tidy.yaml new file mode 100644 index 000000000..3295fc647 --- /dev/null +++ b/.github/workflows/clang-tidy.yaml @@ -0,0 +1,57 @@ +name: clang-tidy + +on: + push: + branches: + - master + - clang-tidy + paths: + - 'sherpa-onnx/csrc/**' + + pull_request: + branches: + - master + paths: + - 'sherpa-onnx/csrc/**' + + workflow_dispatch: + +concurrency: + group: clang-tidy-${{ github.ref }} + cancel-in-progress: true + +jobs: + clang-tidy: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [3.8] + fail-fast: false + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install clang-tidy + shell: bash + run: | + pip install clang-tidy + + - name: Configure + shell: bash + run: | + mkdir build + cd build + cmake -DSHERPA_ONNX_ENABLE_PYTHON=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON .. 
+ + - name: Check with clang-tidy + shell: bash + run: | + cd build + make check diff --git a/.github/workflows/flutter-macos.yaml b/.github/workflows/flutter-macos.yaml index d92d30879..d148fb040 100644 --- a/.github/workflows/flutter-macos.yaml +++ b/.github/workflows/flutter-macos.yaml @@ -184,6 +184,7 @@ jobs: path: ./*.tar.bz2 - name: Publish to huggingface + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') && matrix.build_type == 'Release' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} uses: nick-fields/retry@v3 diff --git a/.github/workflows/flutter-windows-x64.yaml b/.github/workflows/flutter-windows-x64.yaml index ce4e7456a..7dfc197e2 100644 --- a/.github/workflows/flutter-windows-x64.yaml +++ b/.github/workflows/flutter-windows-x64.yaml @@ -133,6 +133,7 @@ jobs: shell: bash run: | d=$PWD + SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) pushd sherpa-onnx/flutter dart pub get @@ -159,6 +160,7 @@ jobs: path: ./*.tar.bz2 - name: Publish to huggingface + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') && matrix.build_type == 'Release' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} uses: nick-fields/retry@v3 diff --git a/CMakeLists.txt b/CMakeLists.txt index dedf52767..d474c6aa9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -167,7 +167,7 @@ if(SHERPA_ONNX_ENABLE_WASM_KWS) endif() if(NOT CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.") + set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ version to be used.") endif() set(CMAKE_CXX_EXTENSIONS OFF) message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}") diff --git a/cmake/openfst.cmake b/cmake/openfst.cmake index 59d4f9fc3..77e4c157d 100644 --- a/cmake/openfst.cmake +++ b/cmake/openfst.cmake @@ -3,18 +3,18 @@ 
function(download_openfst) include(FetchContent) - set(openfst_URL "https://github.com/csukuangfj/openfst/archive/refs/tags/sherpa-onnx-2024-06-13.tar.gz") - set(openfst_URL2 "https://hub.nuaa.cf/csukuangfj/openfst/archive/refs/tags/sherpa-onnx-2024-06-13.tar.gz") - set(openfst_HASH "SHA256=f10a71c6b64d89eabdc316d372b956c30c825c7c298e2f20c780320e8181ffb6") + set(openfst_URL "https://github.com/csukuangfj/openfst/archive/refs/tags/sherpa-onnx-2024-06-19.tar.gz") + set(openfst_URL2 "https://hub.nuaa.cf/csukuangfj/openfst/archive/refs/tags/sherpa-onnx-2024-06-19.tar.gz") + set(openfst_HASH "SHA256=5c98e82cc509c5618502dde4860b8ea04d843850ed57e6d6b590b644b268853d") # If you don't have access to the Internet, # please pre-download it set(possible_file_locations - $ENV{HOME}/Downloads/openfst-sherpa-onnx-2024-06-13.tar.gz - ${CMAKE_SOURCE_DIR}/openfst-sherpa-onnx-2024-06-13.tar.gz - ${CMAKE_BINARY_DIR}/openfst-sherpa-onnx-2024-06-13.tar.gz - /tmp/openfst-sherpa-onnx-2024-06-13.tar.gz - /star-fj/fangjun/download/github/openfst-sherpa-onnx-2024-06-13.tar.gz + $ENV{HOME}/Downloads/openfst-sherpa-onnx-2024-06-19.tar.gz + ${CMAKE_SOURCE_DIR}/openfst-sherpa-onnx-2024-06-19.tar.gz + ${CMAKE_BINARY_DIR}/openfst-sherpa-onnx-2024-06-19.tar.gz + /tmp/openfst-sherpa-onnx-2024-06-19.tar.gz + /star-fj/fangjun/download/github/openfst-sherpa-onnx-2024-06-19.tar.gz ) foreach(f IN LISTS possible_file_locations) diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index bac0499a8..16da143f1 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -534,3 +534,17 @@ if(SHERPA_ONNX_ENABLE_TESTS) sherpa_onnx_add_test(${source}) endforeach() endif() + +set(srcs_to_check) +foreach(s IN LISTS sources) + list(APPEND srcs_to_check ${CMAKE_CURRENT_LIST_DIR}/${s}) +endforeach() + +# For clang-tidy +add_custom_target( + clang-tidy-check + clang-tidy -p ${CMAKE_BINARY_DIR}/compile_commands.json --config-file ${CMAKE_SOURCE_DIR}/.clang-tidy 
${srcs_to_check} + DEPENDS ${sources}) + +add_custom_target(check DEPENDS clang-tidy-check) + diff --git a/sherpa-onnx/csrc/audio-tagging-label-file.cc b/sherpa-onnx/csrc/audio-tagging-label-file.cc index ea1032997..bd71bcb10 100644 --- a/sherpa-onnx/csrc/audio-tagging-label-file.cc +++ b/sherpa-onnx/csrc/audio-tagging-label-file.cc @@ -60,7 +60,7 @@ void AudioTaggingLabels::Init(std::istream &is) { std::size_t pos{}; int32_t i = std::stoi(index, &pos); - if (index.size() == 0 || pos != index.size()) { + if (index.empty() || pos != index.size()) { SHERPA_ONNX_LOGE("Invalid line: %s", line.c_str()); exit(-1); } diff --git a/sherpa-onnx/csrc/base64-decode.cc b/sherpa-onnx/csrc/base64-decode.cc index b22e443e1..5723790f6 100644 --- a/sherpa-onnx/csrc/base64-decode.cc +++ b/sherpa-onnx/csrc/base64-decode.cc @@ -34,7 +34,7 @@ std::string Base64Decode(const std::string &s) { exit(-1); } - int32_t n = s.size() / 4 * 3; + int32_t n = static_cast(s.size()) / 4 * 3; std::string ans; ans.reserve(n); @@ -46,16 +46,16 @@ std::string Base64Decode(const std::string &s) { } int32_t first = (Ord(s[i]) << 2) + ((Ord(s[i + 1]) & 0x30) >> 4); - ans.push_back(first); + ans.push_back(static_cast(first)); if (i + 2 < static_cast(s.size()) && s[i + 2] != '=') { int32_t second = ((Ord(s[i + 1]) & 0x0f) << 4) + ((Ord(s[i + 2]) & 0x3c) >> 2); - ans.push_back(second); + ans.push_back(static_cast(second)); if (i + 3 < static_cast(s.size()) && s[i + 3] != '=') { int32_t third = ((Ord(s[i + 2]) & 0x03) << 6) + Ord(s[i + 3]); - ans.push_back(third); + ans.push_back(static_cast(third)); } } i += 4; diff --git a/sherpa-onnx/csrc/cat.cc b/sherpa-onnx/csrc/cat.cc index 00da2263b..05c40b0c2 100644 --- a/sherpa-onnx/csrc/cat.cc +++ b/sherpa-onnx/csrc/cat.cc @@ -82,9 +82,9 @@ Ort::Value Cat(OrtAllocator *allocator, T *dst = ans.GetTensorMutableData(); for (int32_t i = 0; i != leading_size; ++i) { - for (int32_t n = 0; n != static_cast(values.size()); ++n) { - auto this_dim = 
values[n]->GetTensorTypeAndShapeInfo().GetShape()[dim]; - const T *src = values[n]->GetTensorData(); + for (auto value : values) { + auto this_dim = value->GetTensorTypeAndShapeInfo().GetShape()[dim]; + const T *src = value->GetTensorData(); src += i * this_dim * trailing_size; std::copy(src, src + this_dim * trailing_size, dst); diff --git a/sherpa-onnx/csrc/circular-buffer.cc b/sherpa-onnx/csrc/circular-buffer.cc index 4ed667333..ef937cabe 100644 --- a/sherpa-onnx/csrc/circular-buffer.cc +++ b/sherpa-onnx/csrc/circular-buffer.cc @@ -20,7 +20,7 @@ CircularBuffer::CircularBuffer(int32_t capacity) { } void CircularBuffer::Resize(int32_t new_capacity) { - int32_t capacity = buffer_.size(); + int32_t capacity = static_cast(buffer_.size()); if (new_capacity <= capacity) { SHERPA_ONNX_LOGE("new_capacity (%d) <= original capacity (%d). Skip it.", new_capacity, capacity); @@ -86,7 +86,7 @@ void CircularBuffer::Resize(int32_t new_capacity) { } void CircularBuffer::Push(const float *p, int32_t n) { - int32_t capacity = buffer_.size(); + int32_t capacity = static_cast(buffer_.size()); int32_t size = Size(); if (n + size > capacity) { int32_t new_capacity = std::max(capacity * 2, n + size); @@ -126,7 +126,7 @@ std::vector CircularBuffer::Get(int32_t start_index, int32_t n) const { return {}; } - int32_t capacity = buffer_.size(); + int32_t capacity = static_cast(buffer_.size()); if (start_index - head_ + n > size) { SHERPA_ONNX_LOGE("Invalid start_index: %d and n: %d. 
head_: %d, size: %d", diff --git a/sherpa-onnx/csrc/context-graph.cc b/sherpa-onnx/csrc/context-graph.cc index ef876b360..336208b12 100644 --- a/sherpa-onnx/csrc/context-graph.cc +++ b/sherpa-onnx/csrc/context-graph.cc @@ -67,8 +67,8 @@ void ContextGraph::Build(const std::vector> &token_ids, std::tuple ContextGraph::ForwardOneStep(const ContextState *state, int32_t token, bool strict_mode /*= true*/) const { - const ContextState *node; - float score; + const ContextState *node = nullptr; + float score = 0; if (1 == state->next.count(token)) { node = state->next.at(token).get(); score = node->token_score; @@ -84,7 +84,10 @@ ContextGraph::ForwardOneStep(const ContextState *state, int32_t token, score = node->node_score - state->node_score; } - SHERPA_ONNX_CHECK(nullptr != node); + if (!node) { + SHERPA_ONNX_LOGE("Some bad things happened."); + exit(-1); + } const ContextState *matched_node = node->is_end ? node : (node->output != nullptr ? node->output : nullptr); diff --git a/sherpa-onnx/csrc/endpoint.cc b/sherpa-onnx/csrc/endpoint.cc index 3a9a424c2..afe76fdb7 100644 --- a/sherpa-onnx/csrc/endpoint.cc +++ b/sherpa-onnx/csrc/endpoint.cc @@ -73,10 +73,15 @@ std::string EndpointConfig::ToString() const { return os.str(); } -bool Endpoint::IsEndpoint(int num_frames_decoded, int trailing_silence_frames, +bool Endpoint::IsEndpoint(int32_t num_frames_decoded, + int32_t trailing_silence_frames, float frame_shift_in_seconds) const { - float utterance_length = num_frames_decoded * frame_shift_in_seconds; - float trailing_silence = trailing_silence_frames * frame_shift_in_seconds; + float utterance_length = + static_cast(num_frames_decoded) * frame_shift_in_seconds; + + float trailing_silence = + static_cast(trailing_silence_frames) * frame_shift_in_seconds; + if (RuleActivated(config_.rule1, "rule1", trailing_silence, utterance_length) || RuleActivated(config_.rule2, "rule2", trailing_silence, diff --git a/sherpa-onnx/csrc/endpoint.h b/sherpa-onnx/csrc/endpoint.h index 
73995840d..aea5d7d22 100644 --- a/sherpa-onnx/csrc/endpoint.h +++ b/sherpa-onnx/csrc/endpoint.h @@ -64,7 +64,7 @@ class Endpoint { /// This function returns true if this set of endpointing rules thinks we /// should terminate decoding. - bool IsEndpoint(int num_frames_decoded, int trailing_silence_frames, + bool IsEndpoint(int32_t num_frames_decoded, int32_t trailing_silence_frames, float frame_shift_in_seconds) const; private: diff --git a/sherpa-onnx/csrc/jieba-lexicon.cc b/sherpa-onnx/csrc/jieba-lexicon.cc index bffc11871..1bf64cd50 100644 --- a/sherpa-onnx/csrc/jieba-lexicon.cc +++ b/sherpa-onnx/csrc/jieba-lexicon.cc @@ -103,6 +103,7 @@ class JiebaLexicon::Impl { if (w == "。" || w == "!" || w == "?" || w == ",") { ans.push_back(std::move(this_sentence)); + this_sentence = {}; } } // for (const auto &w : words) diff --git a/sherpa-onnx/csrc/keyword-spotter.cc b/sherpa-onnx/csrc/keyword-spotter.cc index 7e93d7a04..1110ee584 100644 --- a/sherpa-onnx/csrc/keyword-spotter.cc +++ b/sherpa-onnx/csrc/keyword-spotter.cc @@ -4,9 +4,8 @@ #include "sherpa-onnx/csrc/keyword-spotter.h" -#include - #include +#include #include #include #include diff --git a/sherpa-onnx/csrc/lexicon.cc b/sherpa-onnx/csrc/lexicon.cc index 3a502c24c..91307a216 100644 --- a/sherpa-onnx/csrc/lexicon.cc +++ b/sherpa-onnx/csrc/lexicon.cc @@ -82,7 +82,7 @@ std::unordered_map ReadTokens(std::istream &is) { std::string line; std::string sym; - int32_t id; + int32_t id = -1; while (std::getline(is, line)) { std::istringstream iss(line); iss >> sym; @@ -254,6 +254,7 @@ std::vector> Lexicon::ConvertTextToTokenIdsChinese( this_sentence.push_back(eos); } ans.push_back(std::move(this_sentence)); + this_sentence = {}; if (sil != -1) { this_sentence.push_back(sil); @@ -324,6 +325,7 @@ std::vector> Lexicon::ConvertTextToTokenIdsNotChinese( if (w != ",") { this_sentence.push_back(blank); ans.push_back(std::move(this_sentence)); + this_sentence = {}; } continue; diff --git a/sherpa-onnx/csrc/lexicon.h 
b/sherpa-onnx/csrc/lexicon.h index e26a2decc..81a510956 100644 --- a/sherpa-onnx/csrc/lexicon.h +++ b/sherpa-onnx/csrc/lexicon.h @@ -62,8 +62,8 @@ class Lexicon : public OfflineTtsFrontend { std::unordered_map> word2ids_; std::unordered_set punctuations_; std::unordered_map token2id_; - Language language_; - bool debug_; + Language language_ = Language::kUnknown; + bool debug_ = false; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-ct-transformer-model.cc b/sherpa-onnx/csrc/offline-ct-transformer-model.cc index 7f7e63b0f..2ce593b3e 100644 --- a/sherpa-onnx/csrc/offline-ct-transformer-model.cc +++ b/sherpa-onnx/csrc/offline-ct-transformer-model.cc @@ -67,7 +67,7 @@ class OfflineCtTransformerModel::Impl { std::vector tokens; SHERPA_ONNX_READ_META_DATA_VEC_STRING_SEP(tokens, "tokens", "|"); - int32_t vocab_size; + int32_t vocab_size = 0; SHERPA_ONNX_READ_META_DATA(vocab_size, "vocab_size"); if (static_cast(tokens.size()) != vocab_size) { SHERPA_ONNX_LOGE("tokens.size() %d != vocab_size %d", diff --git a/sherpa-onnx/csrc/offline-ctc-model.cc b/sherpa-onnx/csrc/offline-ctc-model.cc index cfa8ab45c..bd646ece3 100644 --- a/sherpa-onnx/csrc/offline-ctc-model.cc +++ b/sherpa-onnx/csrc/offline-ctc-model.cc @@ -19,7 +19,7 @@ namespace { -enum class ModelType { +enum class ModelType : std::uint8_t { kEncDecCTCModelBPE, kEncDecHybridRNNTCTCBPEModel, kTdnn, diff --git a/sherpa-onnx/csrc/offline-stream.cc b/sherpa-onnx/csrc/offline-stream.cc index 6e72a4a1f..79bdb5c54 100644 --- a/sherpa-onnx/csrc/offline-stream.cc +++ b/sherpa-onnx/csrc/offline-stream.cc @@ -4,11 +4,11 @@ #include "sherpa-onnx/csrc/offline-stream.h" -#include - #include +#include #include #include +#include #include "kaldi-native-fbank/csrc/online-feature.h" #include "sherpa-onnx/csrc/macros.h" @@ -56,7 +56,7 @@ class OfflineStream::Impl { public: explicit Impl(const FeatureExtractorConfig &config, ContextGraphPtr context_graph) - : config_(config), context_graph_(context_graph) { + : 
config_(config), context_graph_(std::move(context_graph)) { if (config.is_mfcc) { mfcc_opts_.frame_opts.dither = config_.dither; mfcc_opts_.frame_opts.snip_edges = config_.snip_edges; @@ -266,7 +266,7 @@ class OfflineStream::Impl { OfflineStream::OfflineStream(const FeatureExtractorConfig &config /*= {}*/, ContextGraphPtr context_graph /*= nullptr*/) - : impl_(std::make_unique(config, context_graph)) {} + : impl_(std::make_unique(config, std::move(context_graph))) {} OfflineStream::OfflineStream(WhisperTag tag) : impl_(std::make_unique(tag)) {} diff --git a/sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.cc b/sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.cc index 317c7ad8c..391620a01 100644 --- a/sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.cc +++ b/sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.cc @@ -42,7 +42,7 @@ OfflineTransducerModifiedBeamSearchDecoder::Decode( std::vector context_graphs(batch_size, nullptr); for (int32_t i = 0; i < batch_size; ++i) { - const ContextState *context_state; + const ContextState *context_state = nullptr; if (ss != nullptr) { context_graphs[i] = ss[packed_encoder_out.sorted_indexes[i]]->GetContextGraph(); diff --git a/sherpa-onnx/csrc/offline-tts-character-frontend.cc b/sherpa-onnx/csrc/offline-tts-character-frontend.cc index 2b64e67d8..857200e9c 100644 --- a/sherpa-onnx/csrc/offline-tts-character-frontend.cc +++ b/sherpa-onnx/csrc/offline-tts-character-frontend.cc @@ -30,7 +30,7 @@ static std::unordered_map ReadTokens(std::istream &is) { std::string sym; std::u32string s; - int32_t id; + int32_t id = 0; while (std::getline(is, line)) { std::istringstream iss(line); iss >> sym; @@ -138,6 +138,7 @@ OfflineTtsCharacterFrontend::ConvertTextToTokenIds( } ans.push_back(std::move(this_sentence)); + this_sentence = {}; // re-initialize this_sentence if (use_eos_bos) { @@ -172,6 +173,7 @@ OfflineTtsCharacterFrontend::ConvertTextToTokenIds( } 
ans.push_back(std::move(this_sentence)); + this_sentence = {}; // re-initialize this_sentence if (use_eos_bos) { diff --git a/sherpa-onnx/csrc/offline-tts.cc b/sherpa-onnx/csrc/offline-tts.cc index 4349f98e9..12feda0b7 100644 --- a/sherpa-onnx/csrc/offline-tts.cc +++ b/sherpa-onnx/csrc/offline-tts.cc @@ -5,6 +5,7 @@ #include "sherpa-onnx/csrc/offline-tts.h" #include +#include #include "sherpa-onnx/csrc/file-utils.h" #include "sherpa-onnx/csrc/macros.h" @@ -87,7 +88,7 @@ OfflineTts::~OfflineTts() = default; GeneratedAudio OfflineTts::Generate( const std::string &text, int64_t sid /*=0*/, float speed /*= 1.0*/, GeneratedAudioCallback callback /*= nullptr*/) const { - return impl_->Generate(text, sid, speed, callback); + return impl_->Generate(text, sid, speed, std::move(callback)); } int32_t OfflineTts::SampleRate() const { return impl_->SampleRate(); } diff --git a/sherpa-onnx/csrc/offline-whisper-model.cc b/sherpa-onnx/csrc/offline-whisper-model.cc index 183574f9d..f73234d95 100644 --- a/sherpa-onnx/csrc/offline-whisper-model.cc +++ b/sherpa-onnx/csrc/offline-whisper-model.cc @@ -22,9 +22,9 @@ class OfflineWhisperModel::Impl { explicit Impl(const OfflineModelConfig &config) : config_(config), env_(ORT_LOGGING_LEVEL_ERROR), + debug_(config.debug), sess_opts_(GetSessionOptions(config)), allocator_{} { - debug_ = config_.debug; { auto buf = ReadFile(config.whisper.encoder); InitEncoder(buf.data(), buf.size()); @@ -39,9 +39,9 @@ class OfflineWhisperModel::Impl { explicit Impl(const SpokenLanguageIdentificationConfig &config) : lid_config_(config), env_(ORT_LOGGING_LEVEL_ERROR), + debug_(config_.debug), sess_opts_(GetSessionOptions(config)), allocator_{} { - debug_ = config_.debug; { auto buf = ReadFile(config.whisper.encoder); InitEncoder(buf.data(), buf.size()); @@ -148,7 +148,6 @@ class OfflineWhisperModel::Impl { cross_v = std::move(std::get<4>(decoder_out)); const float *p_logits = std::get<0>(decoder_out).GetTensorData(); - int32_t vocab_size = VocabSize(); const 
auto &all_language_ids = GetAllLanguageIDs(); int32_t lang_id = all_language_ids[0]; @@ -317,18 +316,18 @@ class OfflineWhisperModel::Impl { std::unordered_map id2lang_; // model meta data - int32_t n_text_layer_; - int32_t n_text_ctx_; - int32_t n_text_state_; - int32_t n_vocab_; - int32_t sot_; - int32_t eot_; - int32_t blank_; - int32_t translate_; - int32_t transcribe_; - int32_t no_timestamps_; - int32_t no_speech_; - int32_t is_multilingual_; + int32_t n_text_layer_ = 0; + int32_t n_text_ctx_ = 0; + int32_t n_text_state_ = 0; + int32_t n_vocab_ = 0; + int32_t sot_ = 0; + int32_t eot_ = 0; + int32_t blank_ = 0; + int32_t translate_ = 0; + int32_t transcribe_ = 0; + int32_t no_timestamps_ = 0; + int32_t no_speech_ = 0; + int32_t is_multilingual_ = 0; std::vector sot_sequence_; }; diff --git a/sherpa-onnx/csrc/online-conformer-transducer-model.cc b/sherpa-onnx/csrc/online-conformer-transducer-model.cc index 58cbce01c..8e7dfa1f4 100644 --- a/sherpa-onnx/csrc/online-conformer-transducer-model.cc +++ b/sherpa-onnx/csrc/online-conformer-transducer-model.cc @@ -4,9 +4,8 @@ #include "sherpa-onnx/csrc/online-conformer-transducer-model.h" -#include - #include +#include #include #include #include diff --git a/sherpa-onnx/csrc/online-ctc-fst-decoder.cc b/sherpa-onnx/csrc/online-ctc-fst-decoder.cc index f50578833..95e16ba75 100644 --- a/sherpa-onnx/csrc/online-ctc-fst-decoder.cc +++ b/sherpa-onnx/csrc/online-ctc-fst-decoder.cc @@ -52,8 +52,9 @@ static void DecodeOne(const float *log_probs, int32_t num_rows, if (ok) { std::vector isymbols_out; std::vector osymbols_out; - ok = fst::GetLinearSymbolSequence(fst_out, &isymbols_out, &osymbols_out, - nullptr); + /*ok =*/fst::GetLinearSymbolSequence(fst_out, &isymbols_out, + &osymbols_out, nullptr); + // TODO(fangjun): handle ok is false std::vector tokens; tokens.reserve(isymbols_out.size()); diff --git a/sherpa-onnx/csrc/online-lstm-transducer-model.cc b/sherpa-onnx/csrc/online-lstm-transducer-model.cc index 4a0e838da..eca115931 
100644 --- a/sherpa-onnx/csrc/online-lstm-transducer-model.cc +++ b/sherpa-onnx/csrc/online-lstm-transducer-model.cc @@ -3,9 +3,8 @@ // Copyright (c) 2023 Xiaomi Corporation #include "sherpa-onnx/csrc/online-lstm-transducer-model.h" -#include - #include +#include #include #include #include diff --git a/sherpa-onnx/csrc/online-nemo-ctc-model.cc b/sherpa-onnx/csrc/online-nemo-ctc-model.cc index 3f796e2d7..e4335b4b1 100644 --- a/sherpa-onnx/csrc/online-nemo-ctc-model.cc +++ b/sherpa-onnx/csrc/online-nemo-ctc-model.cc @@ -265,16 +265,16 @@ class OnlineNeMoCtcModel::Impl { std::vector output_names_; std::vector output_names_ptr_; - int32_t window_size_; - int32_t chunk_shift_; - int32_t subsampling_factor_; - int32_t vocab_size_; - int32_t cache_last_channel_dim1_; - int32_t cache_last_channel_dim2_; - int32_t cache_last_channel_dim3_; - int32_t cache_last_time_dim1_; - int32_t cache_last_time_dim2_; - int32_t cache_last_time_dim3_; + int32_t window_size_ = 0; + int32_t chunk_shift_ = 0; + int32_t subsampling_factor_ = 0; + int32_t vocab_size_ = 0; + int32_t cache_last_channel_dim1_ = 0; + int32_t cache_last_channel_dim2_ = 0; + int32_t cache_last_channel_dim3_ = 0; + int32_t cache_last_time_dim1_ = 0; + int32_t cache_last_time_dim2_ = 0; + int32_t cache_last_time_dim3_ = 0; Ort::Value cache_last_channel_{nullptr}; Ort::Value cache_last_time_{nullptr}; diff --git a/sherpa-onnx/csrc/online-recognizer.cc b/sherpa-onnx/csrc/online-recognizer.cc index a49a62f6a..599a0553d 100644 --- a/sherpa-onnx/csrc/online-recognizer.cc +++ b/sherpa-onnx/csrc/online-recognizer.cc @@ -5,9 +5,8 @@ #include "sherpa-onnx/csrc/online-recognizer.h" -#include - #include +#include #include #include #include diff --git a/sherpa-onnx/csrc/online-stream.cc b/sherpa-onnx/csrc/online-stream.cc index 0a30cf40c..972133c77 100644 --- a/sherpa-onnx/csrc/online-stream.cc +++ b/sherpa-onnx/csrc/online-stream.cc @@ -8,6 +8,7 @@ #include #include "sherpa-onnx/csrc/features.h" +#include 
"sherpa-onnx/csrc/transducer-keyword-decoder.h" namespace sherpa_onnx { @@ -15,7 +16,7 @@ class OnlineStream::Impl { public: explicit Impl(const FeatureExtractorConfig &config, ContextGraphPtr context_graph) - : feat_extractor_(config), context_graph_(context_graph) {} + : feat_extractor_(config), context_graph_(std::move(context_graph)) {} void AcceptWaveform(int32_t sampling_rate, const float *waveform, int32_t n) { feat_extractor_.AcceptWaveform(sampling_rate, waveform, n); @@ -146,7 +147,7 @@ class OnlineStream::Impl { OnlineStream::OnlineStream(const FeatureExtractorConfig &config /*= {}*/, ContextGraphPtr context_graph /*= nullptr */) - : impl_(std::make_unique(config, context_graph)) {} + : impl_(std::make_unique(config, std::move(context_graph))) {} OnlineStream::~OnlineStream() = default; diff --git a/sherpa-onnx/csrc/online-stream.h b/sherpa-onnx/csrc/online-stream.h index e9958d736..1cf29de7f 100644 --- a/sherpa-onnx/csrc/online-stream.h +++ b/sherpa-onnx/csrc/online-stream.h @@ -15,7 +15,6 @@ #include "sherpa-onnx/csrc/online-ctc-decoder.h" #include "sherpa-onnx/csrc/online-paraformer-decoder.h" #include "sherpa-onnx/csrc/online-transducer-decoder.h" -#include "sherpa-onnx/csrc/transducer-keyword-decoder.h" namespace sherpa_onnx { diff --git a/sherpa-onnx/csrc/online-transducer-decoder.cc b/sherpa-onnx/csrc/online-transducer-decoder.cc index 0c51eabb5..682b9bc7e 100644 --- a/sherpa-onnx/csrc/online-transducer-decoder.cc +++ b/sherpa-onnx/csrc/online-transducer-decoder.cc @@ -45,13 +45,13 @@ OnlineTransducerDecoderResult &OnlineTransducerDecoderResult::operator=( } OnlineTransducerDecoderResult::OnlineTransducerDecoderResult( - OnlineTransducerDecoderResult &&other) + OnlineTransducerDecoderResult &&other) noexcept : OnlineTransducerDecoderResult() { *this = std::move(other); } OnlineTransducerDecoderResult &OnlineTransducerDecoderResult::operator=( - OnlineTransducerDecoderResult &&other) { + OnlineTransducerDecoderResult &&other) noexcept { if (this == 
&other) { return *this; } diff --git a/sherpa-onnx/csrc/online-transducer-decoder.h b/sherpa-onnx/csrc/online-transducer-decoder.h index 25d798fb6..e507a0fc4 100644 --- a/sherpa-onnx/csrc/online-transducer-decoder.h +++ b/sherpa-onnx/csrc/online-transducer-decoder.h @@ -44,10 +44,10 @@ struct OnlineTransducerDecoderResult { OnlineTransducerDecoderResult &operator=( const OnlineTransducerDecoderResult &other); - OnlineTransducerDecoderResult(OnlineTransducerDecoderResult &&other); + OnlineTransducerDecoderResult(OnlineTransducerDecoderResult &&other) noexcept; OnlineTransducerDecoderResult &operator=( - OnlineTransducerDecoderResult &&other); + OnlineTransducerDecoderResult &&other) noexcept; }; class OnlineStream; diff --git a/sherpa-onnx/csrc/online-transducer-model.cc b/sherpa-onnx/csrc/online-transducer-model.cc index 6a1fe5d55..a6cd49d9e 100644 --- a/sherpa-onnx/csrc/online-transducer-model.cc +++ b/sherpa-onnx/csrc/online-transducer-model.cc @@ -23,7 +23,7 @@ namespace { -enum class ModelType { +enum class ModelType : std::uint8_t { kConformer, kLstm, kZipformer, diff --git a/sherpa-onnx/csrc/online-transducer-nemo-model.cc b/sherpa-onnx/csrc/online-transducer-nemo-model.cc index 3869a8b20..8882c6caa 100644 --- a/sherpa-onnx/csrc/online-transducer-nemo-model.cc +++ b/sherpa-onnx/csrc/online-transducer-nemo-model.cc @@ -5,10 +5,9 @@ #include "sherpa-onnx/csrc/online-transducer-nemo-model.h" -#include -#include - #include +#include +#include #include #include #include @@ -429,8 +428,8 @@ class OnlineTransducerNeMoModel::Impl { std::vector joiner_output_names_; std::vector joiner_output_names_ptr_; - int32_t window_size_; - int32_t chunk_shift_; + int32_t window_size_ = 0; + int32_t chunk_shift_ = 0; int32_t vocab_size_ = 0; int32_t subsampling_factor_ = 8; std::string normalize_type_; @@ -438,12 +437,12 @@ class OnlineTransducerNeMoModel::Impl { int32_t pred_hidden_ = -1; // encoder states - int32_t cache_last_channel_dim1_; - int32_t cache_last_channel_dim2_; - 
int32_t cache_last_channel_dim3_; - int32_t cache_last_time_dim1_; - int32_t cache_last_time_dim2_; - int32_t cache_last_time_dim3_; + int32_t cache_last_channel_dim1_ = 0; + int32_t cache_last_channel_dim2_ = 0; + int32_t cache_last_channel_dim3_ = 0; + int32_t cache_last_time_dim1_ = 0; + int32_t cache_last_time_dim2_ = 0; + int32_t cache_last_time_dim3_ = 0; // init encoder states Ort::Value cache_last_channel_{nullptr}; diff --git a/sherpa-onnx/csrc/online-wenet-ctc-model.cc b/sherpa-onnx/csrc/online-wenet-ctc-model.cc index 34557bf10..50a7a10aa 100644 --- a/sherpa-onnx/csrc/online-wenet-ctc-model.cc +++ b/sherpa-onnx/csrc/online-wenet-ctc-model.cc @@ -192,15 +192,15 @@ class OnlineWenetCtcModel::Impl { std::vector output_names_; std::vector output_names_ptr_; - int32_t head_; - int32_t num_blocks_; - int32_t output_size_; - int32_t cnn_module_kernel_; - int32_t right_context_; - int32_t subsampling_factor_; - int32_t vocab_size_; - - int32_t required_cache_size_; + int32_t head_ = 0; + int32_t num_blocks_ = 0; + int32_t output_size_ = 0; + int32_t cnn_module_kernel_ = 0; + int32_t right_context_ = 0; + int32_t subsampling_factor_ = 0; + int32_t vocab_size_ = 0; + + int32_t required_cache_size_ = 0; Ort::Value attn_cache_{nullptr}; Ort::Value conv_cache_{nullptr}; diff --git a/sherpa-onnx/csrc/online-zipformer-transducer-model.cc b/sherpa-onnx/csrc/online-zipformer-transducer-model.cc index b7e16cb67..2c9b97374 100644 --- a/sherpa-onnx/csrc/online-zipformer-transducer-model.cc +++ b/sherpa-onnx/csrc/online-zipformer-transducer-model.cc @@ -4,9 +4,8 @@ #include "sherpa-onnx/csrc/online-zipformer-transducer-model.h" -#include - #include +#include #include #include #include diff --git a/sherpa-onnx/csrc/online-zipformer2-ctc-model.cc b/sherpa-onnx/csrc/online-zipformer2-ctc-model.cc index aff4e5cb8..3dc3eedfc 100644 --- a/sherpa-onnx/csrc/online-zipformer2-ctc-model.cc +++ b/sherpa-onnx/csrc/online-zipformer2-ctc-model.cc @@ -4,10 +4,8 @@ #include 
"sherpa-onnx/csrc/online-zipformer2-ctc-model.h" -#include -#include - #include +#include #include #include #include @@ -90,7 +88,6 @@ class OnlineZipformer2CtcModel::Impl { std::vector StackStates( std::vector> states) const { int32_t batch_size = static_cast(states.size()); - int32_t num_encoders = static_cast(num_encoder_layers_.size()); std::vector buf(batch_size); @@ -168,7 +165,6 @@ class OnlineZipformer2CtcModel::Impl { assert(states.size() == m * 6 + 2); int32_t batch_size = states[0].GetTensorTypeAndShapeInfo().GetShape()[1]; - int32_t num_encoders = num_encoder_layers_.size(); std::vector> ans; ans.resize(batch_size); diff --git a/sherpa-onnx/csrc/online-zipformer2-transducer-model.cc b/sherpa-onnx/csrc/online-zipformer2-transducer-model.cc index 5d4818548..e5c448210 100644 --- a/sherpa-onnx/csrc/online-zipformer2-transducer-model.cc +++ b/sherpa-onnx/csrc/online-zipformer2-transducer-model.cc @@ -4,10 +4,9 @@ #include "sherpa-onnx/csrc/online-zipformer2-transducer-model.h" -#include -#include - #include +#include +#include #include #include #include diff --git a/sherpa-onnx/csrc/onnx-utils.cc b/sherpa-onnx/csrc/onnx-utils.cc index 2d5c20823..5d9f3745e 100644 --- a/sherpa-onnx/csrc/onnx-utils.cc +++ b/sherpa-onnx/csrc/onnx-utils.cc @@ -281,11 +281,12 @@ CopyableOrtValue &CopyableOrtValue::operator=(const CopyableOrtValue &other) { return *this; } -CopyableOrtValue::CopyableOrtValue(CopyableOrtValue &&other) { +CopyableOrtValue::CopyableOrtValue(CopyableOrtValue &&other) noexcept { *this = std::move(other); } -CopyableOrtValue &CopyableOrtValue::operator=(CopyableOrtValue &&other) { +CopyableOrtValue &CopyableOrtValue::operator=( + CopyableOrtValue &&other) noexcept { if (this == &other) { return *this; } diff --git a/sherpa-onnx/csrc/onnx-utils.h b/sherpa-onnx/csrc/onnx-utils.h index 0b7fcc750..b179b378d 100644 --- a/sherpa-onnx/csrc/onnx-utils.h +++ b/sherpa-onnx/csrc/onnx-utils.h @@ -110,9 +110,9 @@ struct CopyableOrtValue { CopyableOrtValue 
&operator=(const CopyableOrtValue &other); - CopyableOrtValue(CopyableOrtValue &&other); + CopyableOrtValue(CopyableOrtValue &&other) noexcept; - CopyableOrtValue &operator=(CopyableOrtValue &&other); + CopyableOrtValue &operator=(CopyableOrtValue &&other) noexcept; }; std::vector Convert(std::vector values); diff --git a/sherpa-onnx/csrc/packed-sequence.cc b/sherpa-onnx/csrc/packed-sequence.cc index df5b9202a..1c3fe91c4 100644 --- a/sherpa-onnx/csrc/packed-sequence.cc +++ b/sherpa-onnx/csrc/packed-sequence.cc @@ -4,9 +4,8 @@ #include "sherpa-onnx/csrc/packed-sequence.h" -#include - #include +#include #include #include @@ -57,7 +56,7 @@ PackedSequence PackPaddedSequence(OrtAllocator *allocator, int64_t max_T = p_length[indexes[0]]; - int32_t sum_T = std::accumulate(p_length, p_length + n, 0); + auto sum_T = std::accumulate(p_length, p_length + n, static_cast(0)); std::array data_shape{sum_T, v_shape[2]}; diff --git a/sherpa-onnx/csrc/pad-sequence.cc b/sherpa-onnx/csrc/pad-sequence.cc index d9f8ebf9a..3ecf277dc 100644 --- a/sherpa-onnx/csrc/pad-sequence.cc +++ b/sherpa-onnx/csrc/pad-sequence.cc @@ -4,9 +4,8 @@ #include "sherpa-onnx/csrc/pad-sequence.h" -#include - #include +#include #include namespace sherpa_onnx { diff --git a/sherpa-onnx/csrc/parse-options.cc b/sherpa-onnx/csrc/parse-options.cc index f0658223f..33a07f32a 100644 --- a/sherpa-onnx/csrc/parse-options.cc +++ b/sherpa-onnx/csrc/parse-options.cc @@ -11,9 +11,8 @@ #include "sherpa-onnx/csrc/parse-options.h" -#include - #include +#include #include #include #include @@ -33,7 +32,7 @@ ParseOptions::ParseOptions(const std::string &prefix, ParseOptions *po) } else { other_parser_ = po; } - if (po != nullptr && po->prefix_ != "") { + if (po != nullptr && !po->prefix_.empty()) { prefix_ = po->prefix_ + std::string(".") + prefix; } else { prefix_ = prefix; @@ -179,10 +178,10 @@ void ParseOptions::DisableOption(const std::string &name) { string_map_.erase(name); } -int ParseOptions::NumArgs() const { return 
positional_args_.size(); } +int32_t ParseOptions::NumArgs() const { return positional_args_.size(); } -std::string ParseOptions::GetArg(int i) const { - if (i < 1 || i > static_cast(positional_args_.size())) { +std::string ParseOptions::GetArg(int32_t i) const { + if (i < 1 || i > static_cast(positional_args_.size())) { SHERPA_ONNX_LOGE("ParseOptions::GetArg, invalid index %d", i); exit(-1); } @@ -191,7 +190,7 @@ std::string ParseOptions::GetArg(int i) const { } // We currently do not support any other options. -enum ShellType { kBash = 0 }; +enum ShellType : std::uint8_t { kBash = 0 }; // This can be changed in the code if it ever does need to be changed (as it's // unlikely that one compilation of this tool-set would use both shells). @@ -213,7 +212,7 @@ static bool MustBeQuoted(const std::string &str, ShellType st) { if (*c == '\0') { return true; // Must quote empty string } else { - const char *ok_chars[2]; + std::array ok_chars{}; // These seem not to be interpreted as long as there are no other "bad" // characters involved (e.g. "," would be interpreted as part of something @@ -229,7 +228,7 @@ static bool MustBeQuoted(const std::string &str, ShellType st) { // are OK. All others are forbidden (this is easier since the shell // interprets most non-alphanumeric characters). if (!isalnum(*c)) { - const char *d; + const char *d = nullptr; for (d = ok_chars[st]; *d != '\0'; ++d) { if (*c == *d) break; } @@ -269,22 +268,22 @@ static std::string QuoteAndEscape(const std::string &str, ShellType /*st*/) { escape_str = "\\\""; // should never be accessed. 
} - char buf[2]; + std::array buf{}; buf[1] = '\0'; buf[0] = quote_char; - std::string ans = buf; + std::string ans = buf.data(); const char *c = str.c_str(); for (; *c != '\0'; ++c) { if (*c == quote_char) { ans += escape_str; } else { buf[0] = *c; - ans += buf; + ans += buf.data(); } } buf[0] = quote_char; - ans += buf; + ans += buf.data(); return ans; } @@ -293,11 +292,11 @@ std::string ParseOptions::Escape(const std::string &str) { return MustBeQuoted(str, kShellType) ? QuoteAndEscape(str, kShellType) : str; } -int ParseOptions::Read(int argc, const char *const argv[]) { +int32_t ParseOptions::Read(int32_t argc, const char *const *argv) { argc_ = argc; argv_ = argv; std::string key, value; - int i; + int32_t i = 0; // first pass: look for config parameter, look for priority for (i = 1; i < argc; ++i) { @@ -306,13 +305,13 @@ int ParseOptions::Read(int argc, const char *const argv[]) { // a lone "--" marks the end of named options break; } - bool has_equal_sign; + bool has_equal_sign = false; SplitLongArg(argv[i], &key, &value, &has_equal_sign); NormalizeArgName(&key); Trim(&value); - if (key.compare("config") == 0) { + if (key == "config") { ReadConfigFile(value); - } else if (key.compare("help") == 0) { + } else if (key == "help") { PrintUsage(); exit(0); } @@ -330,7 +329,7 @@ int ParseOptions::Read(int argc, const char *const argv[]) { double_dash_seen = true; break; } - bool has_equal_sign; + bool has_equal_sign = false; SplitLongArg(argv[i], &key, &value, &has_equal_sign); NormalizeArgName(&key); Trim(&value); @@ -349,14 +348,14 @@ int ParseOptions::Read(int argc, const char *const argv[]) { if ((std::strcmp(argv[i], "--") == 0) && !double_dash_seen) { double_dash_seen = true; } else { - positional_args_.push_back(std::string(argv[i])); + positional_args_.emplace_back(argv[i]); } } // if the user did not suppress this with --print-args = false.... 
if (print_args_) { std::ostringstream strm; - for (int j = 0; j < argc; ++j) strm << Escape(argv[j]) << " "; + for (int32_t j = 0; j < argc; ++j) strm << Escape(argv[j]) << " "; strm << '\n'; SHERPA_ONNX_LOGE("%s", strm.str().c_str()); } @@ -368,14 +367,14 @@ void ParseOptions::PrintUsage(bool print_command_line /*=false*/) const { os << '\n' << usage_ << '\n'; // first we print application-specific options bool app_specific_header_printed = false; - for (auto it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == false) { // application-specific option + for (const auto &it : doc_map_) { + if (it.second.is_standard_ == false) { // application-specific option if (app_specific_header_printed == false) { // header was not yet printed os << "Options:" << '\n'; app_specific_header_printed = true; } - os << " --" << std::setw(25) << std::left << it->second.name_ << " : " - << it->second.use_msg_ << '\n'; + os << " --" << std::setw(25) << std::left << it.second.name_ << " : " + << it.second.use_msg_ << '\n'; } } if (app_specific_header_printed == true) { @@ -384,17 +383,17 @@ void ParseOptions::PrintUsage(bool print_command_line /*=false*/) const { // then the standard options os << "Standard options:" << '\n'; - for (auto it = doc_map_.begin(); it != doc_map_.end(); ++it) { - if (it->second.is_standard_ == true) { // we have standard option - os << " --" << std::setw(25) << std::left << it->second.name_ << " : " - << it->second.use_msg_ << '\n'; + for (const auto &it : doc_map_) { + if (it.second.is_standard_ == true) { // we have standard option + os << " --" << std::setw(25) << std::left << it.second.name_ << " : " + << it.second.use_msg_ << '\n'; } } os << '\n'; if (print_command_line) { std::ostringstream strm; strm << "Command line was: "; - for (int j = 0; j < argc_; ++j) strm << Escape(argv_[j]) << " "; + for (int32_t j = 0; j < argc_; ++j) strm << Escape(argv_[j]) << " "; strm << '\n'; os << strm.str(); } @@ -405,9 +404,9 @@ void 
ParseOptions::PrintUsage(bool print_command_line /*=false*/) const { void ParseOptions::PrintConfig(std::ostream &os) const { os << '\n' << "[[ Configuration of UI-Registered options ]]" << '\n'; std::string key; - for (auto it = doc_map_.begin(); it != doc_map_.end(); ++it) { - key = it->first; - os << it->second.name_ << " = "; + for (const auto &it : doc_map_) { + key = it.first; + os << it.second.name_ << " = "; if (bool_map_.end() != bool_map_.find(key)) { os << (*bool_map_.at(key) ? "true" : "false"); } else if (int_map_.end() != int_map_.find(key)) { @@ -442,13 +441,13 @@ void ParseOptions::ReadConfigFile(const std::string &filename) { while (std::getline(is, line)) { ++line_number; // trim out the comments - size_t pos; - if ((pos = line.find_first_of('#')) != std::string::npos) { + size_t pos = line.find_first_of('#'); + if (pos != std::string::npos) { line.erase(pos); } // skip empty lines Trim(&line); - if (line.length() == 0) continue; + if (line.empty()) continue; if (line.substr(0, 2) != "--") { SHERPA_ONNX_LOGE( @@ -461,7 +460,7 @@ void ParseOptions::ReadConfigFile(const std::string &filename) { } // parse option - bool has_equal_sign; + bool has_equal_sign = false; SplitLongArg(line, &key, &value, &has_equal_sign); NormalizeArgName(&key); Trim(&value); @@ -527,7 +526,7 @@ void ParseOptions::Trim(std::string *str) const { bool ParseOptions::SetOption(const std::string &key, const std::string &value, bool has_equal_sign) { if (bool_map_.end() != bool_map_.find(key)) { - if (has_equal_sign && value == "") { + if (has_equal_sign && value.empty()) { SHERPA_ONNX_LOGE("Invalid option --%s=", key.c_str()); exit(-1); } @@ -557,12 +556,10 @@ bool ParseOptions::ToBool(std::string str) const { std::transform(str.begin(), str.end(), str.begin(), ::tolower); // allow "" as a valid option for "true", so that --x is the same as --x=true - if ((str.compare("true") == 0) || (str.compare("t") == 0) || - (str.compare("1") == 0) || (str.compare("") == 0)) { + if (str == 
"true" || str == "t" || str == "1" || str.empty()) { return true; } - if ((str.compare("false") == 0) || (str.compare("f") == 0) || - (str.compare("0") == 0)) { + if (str == "false" || str == "f" || str == "0") { return false; } // if it is neither true nor false: @@ -593,7 +590,7 @@ uint32_t ParseOptions::ToUint(const std::string &str) const { } float ParseOptions::ToFloat(const std::string &str) const { - float ret; + float ret = 0; if (!ConvertStringToReal(str, &ret)) { SHERPA_ONNX_LOGE("Invalid floating-point option \"%s\"", str.c_str()); exit(-1); @@ -602,7 +599,7 @@ float ParseOptions::ToFloat(const std::string &str) const { } double ParseOptions::ToDouble(const std::string &str) const { - double ret; + double ret = 0; if (!ConvertStringToReal(str, &ret)) { SHERPA_ONNX_LOGE("Invalid floating-point option \"%s\"", str.c_str()); exit(-1); diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc index ff5e9abe1..aa7b9a2c5 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -37,7 +37,7 @@ static std::unordered_map ReadTokens(std::istream &is) { std::string sym; std::u32string s; - int32_t id; + int32_t id = 0; while (std::getline(is, line)) { std::istringstream iss(line); iss >> sym; diff --git a/sherpa-onnx/csrc/resample.cc b/sherpa-onnx/csrc/resample.cc index f82c61a96..ad5ef2a63 100644 --- a/sherpa-onnx/csrc/resample.cc +++ b/sherpa-onnx/csrc/resample.cc @@ -24,10 +24,9 @@ #include "sherpa-onnx/csrc/resample.h" -#include -#include -#include - +#include +#include +#include #include #include @@ -54,8 +53,8 @@ I Gcd(I m, I n) { } // could use compile-time assertion // but involves messing with complex template stuff. - static_assert(std::is_integral::value, ""); - while (1) { + static_assert(std::is_integral_v); + while (true) { m %= n; if (m == 0) return (n > 0 ? 
n : -n); n %= m; @@ -139,10 +138,10 @@ void LinearResample::SetIndexesAndWeights() { in the header as h(t) = f(t)g(t), evaluated at t. */ float LinearResample::FilterFunc(float t) const { - float window, // raised-cosine (Hanning) window of width - // num_zeros_/2*filter_cutoff_ - filter; // sinc filter function - if (fabs(t) < num_zeros_ / (2.0 * filter_cutoff_)) + float window = 0, // raised-cosine (Hanning) window of width + // num_zeros_/2*filter_cutoff_ + filter = 0; // sinc filter function + if (std::fabs(t) < num_zeros_ / (2.0 * filter_cutoff_)) window = 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t)); else window = 0.0; // outside support of window function @@ -172,15 +171,15 @@ void LinearResample::Resample(const float *input, int32_t input_dim, bool flush, // of it we are producing here. for (int64_t samp_out = output_sample_offset_; samp_out < tot_output_samp; samp_out++) { - int64_t first_samp_in; - int32_t samp_out_wrapped; + int64_t first_samp_in = 0; + int32_t samp_out_wrapped = 0; GetIndexes(samp_out, &first_samp_in, &samp_out_wrapped); const std::vector &weights = weights_[samp_out_wrapped]; // first_input_index is the first index into "input" that we have a weight // for. int32_t first_input_index = static_cast(first_samp_in - input_sample_offset_); - float this_output; + float this_output = 0; if (first_input_index >= 0 && first_input_index + static_cast(weights.size()) <= input_dim) { this_output = @@ -239,7 +238,7 @@ int64_t LinearResample::GetNumOutputSamples(int64_t input_num_samp, // largest integer in the interval [ 0, 2 - 0.9 ) are the same (both one). // So when we're subtracting the window-width we can ignore the fractional // part. 
- int32_t window_width_ticks = floor(window_width * tick_freq); + int32_t window_width_ticks = std::floor(window_width * tick_freq); // The time-period of the output that we can sample gets reduced // by the window-width (which is actually the distance from the // center to the edge of the windowing function) if we're not @@ -287,7 +286,7 @@ void LinearResample::SetRemainder(const float *input, int32_t input_dim) { // that are "in the past" relative to the beginning of the latest // input... anyway, storing more remainder than needed is not harmful. int32_t max_remainder_needed = - ceil(samp_rate_in_ * num_zeros_ / filter_cutoff_); + std::ceil(samp_rate_in_ * num_zeros_ / filter_cutoff_); input_remainder_.resize(max_remainder_needed); for (int32_t index = -static_cast(input_remainder_.size()); index < 0; index++) { diff --git a/sherpa-onnx/csrc/resample.h b/sherpa-onnx/csrc/resample.h index 2006ae900..d6e22a198 100644 --- a/sherpa-onnx/csrc/resample.h +++ b/sherpa-onnx/csrc/resample.h @@ -130,11 +130,11 @@ class LinearResample { // the following variables keep track of where we are in a particular signal, // if it is being provided over multiple calls to Resample(). - int64_t input_sample_offset_; ///< The number of input samples we have - ///< already received for this signal - ///< (including anything in remainder_) - int64_t output_sample_offset_; ///< The number of samples we have already - ///< output for this signal. + int64_t input_sample_offset_ = 0; ///< The number of input samples we have + ///< already received for this signal + ///< (including anything in remainder_) + int64_t output_sample_offset_ = 0; ///< The number of samples we have already + ///< output for this signal. std::vector input_remainder_; ///< A small trailing part of the ///< previously seen input signal. 
}; diff --git a/sherpa-onnx/csrc/session.cc b/sherpa-onnx/csrc/session.cc index 431a6a761..0f6ed89db 100644 --- a/sherpa-onnx/csrc/session.cc +++ b/sherpa-onnx/csrc/session.cc @@ -21,14 +21,14 @@ namespace sherpa_onnx { - static void OrtStatusFailure(OrtStatus *status, const char *s) { - const auto &api = Ort::GetApi(); - const char *msg = api.GetErrorMessage(status); - SHERPA_ONNX_LOGE( + const auto &api = Ort::GetApi(); + const char *msg = api.GetErrorMessage(status); + SHERPA_ONNX_LOGE( "Failed to enable TensorRT : %s." - "Available providers: %s. Fallback to cuda", msg, s); - api.ReleaseStatus(status); + "Available providers: %s. Fallback to cuda", + msg, s); + api.ReleaseStatus(status); } static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads, @@ -65,29 +65,28 @@ static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads, } case Provider::kTRT: { struct TrtPairs { - const char* op_keys; - const char* op_values; + const char *op_keys; + const char *op_values; }; std::vector trt_options = { - {"device_id", "0"}, - {"trt_max_workspace_size", "2147483648"}, - {"trt_max_partition_iterations", "10"}, - {"trt_min_subgraph_size", "5"}, - {"trt_fp16_enable", "0"}, - {"trt_detailed_build_log", "0"}, - {"trt_engine_cache_enable", "1"}, - {"trt_engine_cache_path", "."}, - {"trt_timing_cache_enable", "1"}, - {"trt_timing_cache_path", "."} - }; + {"device_id", "0"}, + {"trt_max_workspace_size", "2147483648"}, + {"trt_max_partition_iterations", "10"}, + {"trt_min_subgraph_size", "5"}, + {"trt_fp16_enable", "0"}, + {"trt_detailed_build_log", "0"}, + {"trt_engine_cache_enable", "1"}, + {"trt_engine_cache_path", "."}, + {"trt_timing_cache_enable", "1"}, + {"trt_timing_cache_path", "."}}; // ToDo : Trt configs // "trt_int8_enable" // "trt_int8_use_native_calibration_table" // "trt_dump_subgraphs" - std::vector option_keys, option_values; - for (const TrtPairs& pair : trt_options) { + std::vector option_keys, option_values; + for (const TrtPairs &pair : 
trt_options) { option_keys.emplace_back(pair.op_keys); option_values.emplace_back(pair.op_values); } @@ -95,19 +94,23 @@ static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads, std::vector available_providers = Ort::GetAvailableProviders(); if (std::find(available_providers.begin(), available_providers.end(), - "TensorrtExecutionProvider") != available_providers.end()) { - const auto& api = Ort::GetApi(); + "TensorrtExecutionProvider") != available_providers.end()) { + const auto &api = Ort::GetApi(); - OrtTensorRTProviderOptionsV2* tensorrt_options; - OrtStatus *statusC = api.CreateTensorRTProviderOptions( - &tensorrt_options); + OrtTensorRTProviderOptionsV2 *tensorrt_options = nullptr; + OrtStatus *statusC = + api.CreateTensorRTProviderOptions(&tensorrt_options); OrtStatus *statusU = api.UpdateTensorRTProviderOptions( - tensorrt_options, option_keys.data(), option_values.data(), - option_keys.size()); + tensorrt_options, option_keys.data(), option_values.data(), + option_keys.size()); sess_opts.AppendExecutionProvider_TensorRT_V2(*tensorrt_options); - if (statusC) { OrtStatusFailure(statusC, os.str().c_str()); } - if (statusU) { OrtStatusFailure(statusU, os.str().c_str()); } + if (statusC) { + OrtStatusFailure(statusC, os.str().c_str()); + } + if (statusU) { + OrtStatusFailure(statusU, os.str().c_str()); + } api.ReleaseTensorRTProviderOptions(tensorrt_options); } diff --git a/sherpa-onnx/csrc/silero-vad-model.cc b/sherpa-onnx/csrc/silero-vad-model.cc index a0c1e6c53..3c8233ea8 100644 --- a/sherpa-onnx/csrc/silero-vad-model.cc +++ b/sherpa-onnx/csrc/silero-vad-model.cc @@ -20,11 +20,11 @@ class SileroVadModel::Impl { : config_(config), env_(ORT_LOGGING_LEVEL_ERROR), sess_opts_(GetSessionOptions(config)), - allocator_{} { + allocator_{}, + sample_rate_(config.sample_rate) { auto buf = ReadFile(config.silero_vad.model); Init(buf.data(), buf.size()); - sample_rate_ = config.sample_rate; if (sample_rate_ != 16000) { SHERPA_ONNX_LOGE("Expected sample rate 
16000. Given: %d", config.sample_rate); diff --git a/sherpa-onnx/csrc/slice.cc b/sherpa-onnx/csrc/slice.cc index 6cc577587..5b7cf97df 100644 --- a/sherpa-onnx/csrc/slice.cc +++ b/sherpa-onnx/csrc/slice.cc @@ -4,9 +4,8 @@ #include "sherpa-onnx/csrc/slice.h" -#include - #include +#include #include namespace sherpa_onnx { diff --git a/sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc b/sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc index 53ff99408..fac2ea654 100644 --- a/sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc +++ b/sherpa-onnx/csrc/speaker-embedding-extractor-impl.cc @@ -12,7 +12,7 @@ namespace sherpa_onnx { namespace { -enum class ModelType { +enum class ModelType : std::uint8_t { kWeSpeaker, k3dSpeaker, kNeMo, diff --git a/sherpa-onnx/csrc/speaker-embedding-manager.cc b/sherpa-onnx/csrc/speaker-embedding-manager.cc index 569b8d4fc..6c90c1953 100644 --- a/sherpa-onnx/csrc/speaker-embedding-manager.cc +++ b/sherpa-onnx/csrc/speaker-embedding-manager.cc @@ -122,7 +122,7 @@ class SpeakerEmbeddingManager::Impl { Eigen::VectorXf scores = embedding_matrix_ * v; - Eigen::VectorXf::Index max_index; + Eigen::VectorXf::Index max_index = 0; float max_score = scores.maxCoeff(&max_index); if (max_score < threshold) { return {}; @@ -178,11 +178,12 @@ class SpeakerEmbeddingManager::Impl { std::vector GetAllSpeakers() const { std::vector all_speakers; + all_speakers.reserve(name2row_.size()); for (const auto &p : name2row_) { all_speakers.push_back(p.first); } - std::stable_sort(all_speakers.begin(), all_speakers.end()); + std::sort(all_speakers.begin(), all_speakers.end()); return all_speakers; } diff --git a/sherpa-onnx/csrc/spoken-language-identification-impl.cc b/sherpa-onnx/csrc/spoken-language-identification-impl.cc index 016ac7e0c..b8984a66f 100644 --- a/sherpa-onnx/csrc/spoken-language-identification-impl.cc +++ b/sherpa-onnx/csrc/spoken-language-identification-impl.cc @@ -18,7 +18,7 @@ namespace sherpa_onnx { namespace { -enum class ModelType { +enum 
class ModelType : std::uint8_t { kWhisper, kUnknown, }; diff --git a/sherpa-onnx/csrc/stack.cc b/sherpa-onnx/csrc/stack.cc index 302ec733e..1d0fac514 100644 --- a/sherpa-onnx/csrc/stack.cc +++ b/sherpa-onnx/csrc/stack.cc @@ -71,8 +71,8 @@ Ort::Value Stack(OrtAllocator *allocator, T *dst = ans.GetTensorMutableData(); for (int32_t i = 0; i != leading_size; ++i) { - for (int32_t n = 0; n != static_cast(values.size()); ++n) { - const T *src = values[n]->GetTensorData(); + for (auto value : values) { + const T *src = value->GetTensorData(); src += i * trailing_size; std::copy(src, src + trailing_size, dst); diff --git a/sherpa-onnx/csrc/symbol-table.cc b/sherpa-onnx/csrc/symbol-table.cc index 524b26892..8862972b7 100644 --- a/sherpa-onnx/csrc/symbol-table.cc +++ b/sherpa-onnx/csrc/symbol-table.cc @@ -36,7 +36,7 @@ SymbolTable::SymbolTable(AAssetManager *mgr, const std::string &filename) { void SymbolTable::Init(std::istream &is) { std::string sym; - int32_t id; + int32_t id = 0; while (is >> sym >> id) { #if 0 // we disable the test here since for some multi-lingual BPE models diff --git a/sherpa-onnx/csrc/text-utils.cc b/sherpa-onnx/csrc/text-utils.cc index 04586dd8c..3f12e1460 100644 --- a/sherpa-onnx/csrc/text-utils.cc +++ b/sherpa-onnx/csrc/text-utils.cc @@ -5,9 +5,8 @@ #include "sherpa-onnx/csrc/text-utils.h" -#include - #include +#include #include #include #include diff --git a/sherpa-onnx/csrc/transducer-keyword-decoder.cc b/sherpa-onnx/csrc/transducer-keyword-decoder.cc index af78cb9c2..b94b7ba10 100644 --- a/sherpa-onnx/csrc/transducer-keyword-decoder.cc +++ b/sherpa-onnx/csrc/transducer-keyword-decoder.cc @@ -151,7 +151,6 @@ void TransducerKeywordDecoder::Decode( if (matched) { float ys_prob = 0.0; - int32_t length = best_hyp.ys_probs.size(); for (int32_t i = 0; i < matched_state->level; ++i) { ys_prob += best_hyp.ys_probs[i]; } diff --git a/sherpa-onnx/csrc/transpose.cc b/sherpa-onnx/csrc/transpose.cc index 5ec32667e..90a34599b 100644 --- 
a/sherpa-onnx/csrc/transpose.cc +++ b/sherpa-onnx/csrc/transpose.cc @@ -4,9 +4,8 @@ #include "sherpa-onnx/csrc/transpose.h" -#include - #include +#include #include namespace sherpa_onnx { diff --git a/sherpa-onnx/csrc/unbind.cc b/sherpa-onnx/csrc/unbind.cc index ffe5a6c37..587e268c1 100644 --- a/sherpa-onnx/csrc/unbind.cc +++ b/sherpa-onnx/csrc/unbind.cc @@ -4,9 +4,8 @@ #include "sherpa-onnx/csrc/unbind.h" -#include - #include +#include #include #include #include diff --git a/sherpa-onnx/csrc/utils.cc b/sherpa-onnx/csrc/utils.cc index 93de43e73..b5df9682f 100644 --- a/sherpa-onnx/csrc/utils.cc +++ b/sherpa-onnx/csrc/utils.cc @@ -30,7 +30,6 @@ static bool EncodeBase(const std::vector &lines, std::vector tmp_thresholds; std::vector tmp_phrases; - std::string line; std::string word; bool has_scores = false; bool has_thresholds = false; @@ -72,6 +71,7 @@ static bool EncodeBase(const std::vector &lines, } } ids->push_back(std::move(tmp_ids)); + tmp_ids = {}; tmp_scores.push_back(score); tmp_phrases.push_back(phrase); tmp_thresholds.push_back(threshold); diff --git a/sherpa-onnx/csrc/wave-reader.cc b/sherpa-onnx/csrc/wave-reader.cc index b9d262796..1058db596 100644 --- a/sherpa-onnx/csrc/wave-reader.cc +++ b/sherpa-onnx/csrc/wave-reader.cc @@ -100,13 +100,13 @@ struct WaveHeader { int32_t subchunk2_id; // a tag of this chunk int32_t subchunk2_size; // size of subchunk2 }; -static_assert(sizeof(WaveHeader) == 44, ""); +static_assert(sizeof(WaveHeader) == 44); // Read a wave file of mono-channel. // Return its samples normalized to the range [-1, 1). 
std::vector ReadWaveImpl(std::istream &is, int32_t *sampling_rate, bool *is_ok) { - WaveHeader header; + WaveHeader header{}; is.read(reinterpret_cast(&header), sizeof(header)); if (!is) { *is_ok = false; diff --git a/sherpa-onnx/csrc/wave-writer.cc b/sherpa-onnx/csrc/wave-writer.cc index f20af4b13..069b6276e 100644 --- a/sherpa-onnx/csrc/wave-writer.cc +++ b/sherpa-onnx/csrc/wave-writer.cc @@ -37,7 +37,7 @@ struct WaveHeader { bool WriteWave(const std::string &filename, int32_t sampling_rate, const float *samples, int32_t n) { - WaveHeader header; + WaveHeader header{}; header.chunk_id = 0x46464952; // FFIR header.format = 0x45564157; // EVAW header.subchunk1_id = 0x20746d66; // "fmt " From 675fb1574f82d591df362f6a0a52f69a1a9fc647 Mon Sep 17 00:00:00 2001 From: Zhong-Yi Li Date: Wed, 19 Jun 2024 20:52:42 +0800 Subject: [PATCH 034/237] offline transducer: treat unk as blank (#1005) Co-authored-by: chungyi.li --- .../csrc/offline-recognizer-transducer-impl.h | 17 +++++++++++++---- .../offline-transducer-greedy-search-decoder.cc | 4 +++- .../offline-transducer-greedy-search-decoder.h | 4 +++- ...e-transducer-modified-beam-search-decoder.cc | 5 +++-- ...ne-transducer-modified-beam-search-decoder.h | 4 +++- 5 files changed, 25 insertions(+), 9 deletions(-) diff --git a/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h b/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h index 13357f79c..c439319eb 100644 --- a/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h +++ b/sherpa-onnx/csrc/offline-recognizer-transducer-impl.h @@ -78,9 +78,13 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl { config_(config), symbol_table_(config_.model_config.tokens), model_(std::make_unique(config_.model_config)) { + if (symbol_table_.Contains("")) { + unk_id_ = symbol_table_[""]; + } + if (config_.decoding_method == "greedy_search") { decoder_ = std::make_unique( - model_.get(), config_.blank_penalty); + model_.get(), unk_id_, config_.blank_penalty); } else 
if (config_.decoding_method == "modified_beam_search") { if (!config_.lm_config.model.empty()) { lm_ = OfflineLM::Create(config.lm_config); @@ -97,7 +101,7 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl { decoder_ = std::make_unique( model_.get(), lm_.get(), config_.max_active_paths, - config_.lm_config.scale, config_.blank_penalty); + config_.lm_config.scale, unk_id_, config_.blank_penalty); } else { SHERPA_ONNX_LOGE("Unsupported decoding method: %s", config_.decoding_method.c_str()); @@ -113,9 +117,13 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl { symbol_table_(mgr, config_.model_config.tokens), model_(std::make_unique(mgr, config_.model_config)) { + if (symbol_table_.Contains("")) { + unk_id_ = symbol_table_[""]; + } + if (config_.decoding_method == "greedy_search") { decoder_ = std::make_unique( - model_.get(), config_.blank_penalty); + model_.get(), unk_id_, config_.blank_penalty); } else if (config_.decoding_method == "modified_beam_search") { if (!config_.lm_config.model.empty()) { lm_ = OfflineLM::Create(mgr, config.lm_config); @@ -133,7 +141,7 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl { decoder_ = std::make_unique( model_.get(), lm_.get(), config_.max_active_paths, - config_.lm_config.scale, config_.blank_penalty); + config_.lm_config.scale, unk_id_, config_.blank_penalty); } else { SHERPA_ONNX_LOGE("Unsupported decoding method: %s", config_.decoding_method.c_str()); @@ -293,6 +301,7 @@ class OfflineRecognizerTransducerImpl : public OfflineRecognizerImpl { std::unique_ptr model_; std::unique_ptr decoder_; std::unique_ptr lm_; + int32_t unk_id_ = -1; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-transducer-greedy-search-decoder.cc b/sherpa-onnx/csrc/offline-transducer-greedy-search-decoder.cc index c8809a9f1..6fd3bf404 100644 --- a/sherpa-onnx/csrc/offline-transducer-greedy-search-decoder.cc +++ 
b/sherpa-onnx/csrc/offline-transducer-greedy-search-decoder.cc @@ -57,7 +57,9 @@ OfflineTransducerGreedySearchDecoder::Decode(Ort::Value encoder_out, std::max_element(static_cast(p_logit), static_cast(p_logit) + vocab_size))); p_logit += vocab_size; - if (y != 0) { + // blank id is hardcoded to 0 + // also, it treats unk as blank + if (y != 0 && y != unk_id_) { ans[i].tokens.push_back(y); ans[i].timestamps.push_back(t); emitted = true; diff --git a/sherpa-onnx/csrc/offline-transducer-greedy-search-decoder.h b/sherpa-onnx/csrc/offline-transducer-greedy-search-decoder.h index b284d22a7..79109e60d 100644 --- a/sherpa-onnx/csrc/offline-transducer-greedy-search-decoder.h +++ b/sherpa-onnx/csrc/offline-transducer-greedy-search-decoder.h @@ -15,8 +15,9 @@ namespace sherpa_onnx { class OfflineTransducerGreedySearchDecoder : public OfflineTransducerDecoder { public: OfflineTransducerGreedySearchDecoder(OfflineTransducerModel *model, + int32_t unk_id, float blank_penalty) - : model_(model), blank_penalty_(blank_penalty) {} + : model_(model), unk_id_(unk_id), blank_penalty_(blank_penalty) {} std::vector Decode( Ort::Value encoder_out, Ort::Value encoder_out_length, @@ -24,6 +25,7 @@ class OfflineTransducerGreedySearchDecoder : public OfflineTransducerDecoder { private: OfflineTransducerModel *model_; // Not owned + int32_t unk_id_; float blank_penalty_; }; diff --git a/sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.cc b/sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.cc index 391620a01..7e81624eb 100644 --- a/sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.cc +++ b/sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.cc @@ -131,8 +131,9 @@ OfflineTransducerModifiedBeamSearchDecoder::Decode( float context_score = 0; auto context_state = new_hyp.context_state; - if (new_token != 0) { - // blank id is fixed to 0 + // blank is hardcoded to 0 + // also, it treats unk as blank + if (new_token != 0 && new_token != 
unk_id_) { new_hyp.ys.push_back(new_token); new_hyp.timestamps.push_back(t); if (context_graphs[i] != nullptr) { diff --git a/sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.h b/sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.h index 08fa4182f..2e67cd71a 100644 --- a/sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.h +++ b/sherpa-onnx/csrc/offline-transducer-modified-beam-search-decoder.h @@ -19,12 +19,13 @@ class OfflineTransducerModifiedBeamSearchDecoder OfflineTransducerModifiedBeamSearchDecoder(OfflineTransducerModel *model, OfflineLM *lm, int32_t max_active_paths, - float lm_scale, + float lm_scale, int32_t unk_id, float blank_penalty) : model_(model), lm_(lm), max_active_paths_(max_active_paths), lm_scale_(lm_scale), + unk_id_(unk_id), blank_penalty_(blank_penalty) {} std::vector Decode( @@ -37,6 +38,7 @@ class OfflineTransducerModifiedBeamSearchDecoder int32_t max_active_paths_; float lm_scale_; // used only when lm_ is not nullptr + int32_t unk_id_; float blank_penalty_; }; From 36336b31f42d35a9a1448edd1eb1f652f41b344d Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 20 Jun 2024 18:05:57 +0800 Subject: [PATCH 035/237] Build Android APK for Thai (#1036) --- .../online-decode-files/run-transducer-itn.sh | 1 + scripts/apk/generate-vad-asr-apk-script.py | 37 +++++++++++++++---- sherpa-onnx/kotlin-api/OfflineRecognizer.kt | 13 +++++++ 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/dotnet-examples/online-decode-files/run-transducer-itn.sh b/dotnet-examples/online-decode-files/run-transducer-itn.sh index 17c595789..0c81fc7d8 100755 --- a/dotnet-examples/online-decode-files/run-transducer-itn.sh +++ b/dotnet-examples/online-decode-files/run-transducer-itn.sh @@ -24,5 +24,6 @@ dotnet run -c Release \ --encoder ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx \ --decoder 
./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.int8.onnx \ --joiner ./sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.int8.onnx \ + --rule-fsts ./itn_zh_number.fst \ --decoding-method greedy_search \ --files ./itn-zh-number.wav diff --git a/scripts/apk/generate-vad-asr-apk-script.py b/scripts/apk/generate-vad-asr-apk-script.py index 61188ca7f..489f2b90b 100755 --- a/scripts/apk/generate-vad-asr-apk-script.py +++ b/scripts/apk/generate-vad-asr-apk-script.py @@ -55,13 +55,13 @@ def get_models(): short_name="whisper_tiny", cmd=""" pushd $model_name - rm -v tiny.en-encoder.onnx - rm -v tiny.en-decoder.onnx + rm -fv tiny.en-encoder.onnx + rm -fv tiny.en-decoder.onnx rm -rf test_wavs - rm -v *.py - rm -v requirements.txt - rm -v .gitignore - rm -v README.md + rm -fv *.py + rm -fv requirements.txt + rm -fv .gitignore + rm -fv README.md ls -lh @@ -80,7 +80,7 @@ def get_models(): fi pushd $model_name - rm -v README.md + rm -fv README.md rm -rfv test_wavs rm model.onnx @@ -102,7 +102,7 @@ def get_models(): pushd $model_name rm -rfv test_wavs - rm -v README.md + rm -fv README.md mv -v data/lang_char/tokens.txt ./ rm -rfv data/lang_char @@ -193,6 +193,27 @@ def get_models(): ls -lh + popd + """, + ), + Model( + model_name="sherpa-onnx-zipformer-thai-2024-06-20", + idx=12, + lang="th", + short_name="zipformer", + cmd=""" + pushd $model_name + + rm -rfv test_wavs + rm -fv README.md + rm -fv bpe.model + + rm encoder-epoch-12-avg-5.onnx + rm decoder-epoch-12-avg-5.int8.onnx + rm joiner-epoch-12-avg-5.onnx + + ls -lh + popd """, ), diff --git a/sherpa-onnx/kotlin-api/OfflineRecognizer.kt b/sherpa-onnx/kotlin-api/OfflineRecognizer.kt index c910e8d68..7163d3d10 100644 --- a/sherpa-onnx/kotlin-api/OfflineRecognizer.kt +++ b/sherpa-onnx/kotlin-api/OfflineRecognizer.kt @@ -284,6 +284,19 @@ fun getOfflineModelConfig(type: Int): OfflineModelConfig? 
{ modelType = "tele_speech", ) } + + 12 -> { + val modelDir = "sherpa-onnx-zipformer-thai-2024-06-20" + return OfflineModelConfig( + transducer = OfflineTransducerModelConfig( + encoder = "$modelDir/encoder-epoch-12-avg-5.int8.onnx", + decoder = "$modelDir/decoder-epoch-12-avg-5.onnx", + joiner = "$modelDir/joiner-epoch-12-avg-5.int8.onnx", + ), + tokens = "$modelDir/tokens.txt", + modelType = "zipformer2", + ) + } } return null } From 96ab843173ceac7ecf8e49be408705766d57b0b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BD=AD=E9=9C=87=E4=B8=9C?= <275331498@qq.com> Date: Fri, 21 Jun 2024 11:15:59 +0800 Subject: [PATCH 036/237] fix typo (#1038) --- python-api-examples/streaming_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-api-examples/streaming_server.py b/python-api-examples/streaming_server.py index b5f500e08..c7c1b8de6 100755 --- a/python-api-examples/streaming_server.py +++ b/python-api-examples/streaming_server.py @@ -158,7 +158,7 @@ def add_model_args(parser: argparse.ArgumentParser): parser.add_argument( "--paraformer-decoder", type=str, - help="Path to the transducer decoder model.", + help="Path to the paraformer decoder model.", ) parser.add_argument( From 9dd0e03568295b1faef64d3cbf3916a36b04b547 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 22 Jun 2024 18:18:36 +0800 Subject: [PATCH 037/237] Enable to stop TTS generation (#1041) --- CMakeLists.txt | 2 +- .../com/k2fsa/sherpa/onnx/MainActivity.kt | 35 ++++++++-- .../main/java/com/k2fsa/sherpa/onnx/Tts.kt | 4 +- .../app/src/main/res/layout/activity_main.xml | 12 ++++ .../app/src/main/res/values/strings.xml | 1 + .../sherpa/onnx/tts/engine/TtsService.kt | 7 +- .../non-streaming-asr/pubspec.yaml | 2 +- dart-api-examples/streaming-asr/pubspec.yaml | 2 +- dart-api-examples/tts/bin/piper.dart | 4 ++ dart-api-examples/tts/pubspec.yaml | 2 +- dart-api-examples/vad/pubspec.yaml | 2 +- dotnet-examples/offline-tts-play/Program.cs | 4 ++ kotlin-api-examples/test_tts.kt | 
42 +++++++++++- .../NonStreamingTextToSpeech.rc | Bin 12926 -> 13026 bytes .../NonStreamingTextToSpeechDlg.cpp | 62 ++++++++++++------ .../NonStreamingTextToSpeechDlg.h | 3 + .../NonStreamingTextToSpeech/Resource.h | 3 +- nodejs-addon-examples/package.json | 2 +- python-api-examples/offline-tts-play.py | 7 ++ scripts/dotnet/OfflineTts.cs | 6 +- sherpa-onnx/c-api/c-api.cc | 10 +-- sherpa-onnx/c-api/c-api.h | 15 +++-- sherpa-onnx/csrc/offline-tts-vits-impl.h | 10 +-- sherpa-onnx/csrc/offline-tts.h | 4 +- .../csrc/sherpa-onnx-offline-tts-play-alsa.cc | 11 +++- .../csrc/sherpa-onnx-offline-tts-play.cc | 10 ++- sherpa-onnx/csrc/sherpa-onnx-offline-tts.cc | 3 +- sherpa-onnx/flutter/CHANGELOG.md | 4 ++ .../flutter/lib/src/sherpa_onnx_bindings.dart | 2 +- sherpa-onnx/flutter/lib/src/tts.dart | 6 +- sherpa-onnx/jni/offline-tts.cc | 34 +++++++++- sherpa-onnx/python/csrc/offline-tts.cc | 6 +- 32 files changed, 248 insertions(+), 69 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d474c6aa9..ec529bab6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(sherpa-onnx) # ./nodejs-addon-examples # ./dart-api-examples/ # ./sherpa-onnx/flutter/CHANGELOG.md -set(SHERPA_ONNX_VERSION "1.10.0") +set(SHERPA_ONNX_VERSION "1.10.1") # Disable warning about # diff --git a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt index f44bef8eb..b95ad7d78 100644 --- a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt +++ b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt @@ -26,6 +26,9 @@ class MainActivity : AppCompatActivity() { private lateinit var speed: EditText private lateinit var generate: Button private lateinit var play: Button + private lateinit var stop: Button + private var stopped: Boolean = false + private var mediaPlayer: MediaPlayer? 
= null // see // https://developer.android.com/reference/kotlin/android/media/AudioTrack @@ -49,9 +52,11 @@ class MainActivity : AppCompatActivity() { generate = findViewById(R.id.generate) play = findViewById(R.id.play) + stop = findViewById(R.id.stop) generate.setOnClickListener { onClickGenerate() } play.setOnClickListener { onClickPlay() } + stop.setOnClickListener { onClickStop() } sid.setText("0") speed.setText("1.0") @@ -70,7 +75,7 @@ class MainActivity : AppCompatActivity() { AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_FLOAT ) - Log.i(TAG, "sampleRate: ${sampleRate}, buffLength: ${bufLength}") + Log.i(TAG, "sampleRate: $sampleRate, buffLength: $bufLength") val attr = AudioAttributes.Builder().setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) .setUsage(AudioAttributes.USAGE_MEDIA) @@ -90,8 +95,14 @@ class MainActivity : AppCompatActivity() { } // this function is called from C++ - private fun callback(samples: FloatArray) { - track.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING) + private fun callback(samples: FloatArray): Int { + if (!stopped) { + track.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING) + return 1 + } else { + track.stop() + return 0 + } } private fun onClickGenerate() { @@ -127,6 +138,8 @@ class MainActivity : AppCompatActivity() { track.play() play.isEnabled = false + generate.isEnabled = false + stopped = false Thread { val audio = tts.generateWithCallback( text = textStr, @@ -140,6 +153,7 @@ class MainActivity : AppCompatActivity() { if (ok) { runOnUiThread { play.isEnabled = true + generate.isEnabled = true track.stop() } } @@ -148,11 +162,22 @@ class MainActivity : AppCompatActivity() { private fun onClickPlay() { val filename = application.filesDir.absolutePath + "/generated.wav" - val mediaPlayer = MediaPlayer.create( + mediaPlayer?.stop() + mediaPlayer = MediaPlayer.create( applicationContext, Uri.fromFile(File(filename)) ) - mediaPlayer.start() + mediaPlayer?.start() + } + + private fun 
onClickStop() { + stopped = true + play.isEnabled = true + generate.isEnabled = true + track.pause() + track.flush() + mediaPlayer?.stop() + mediaPlayer = null } private fun initTts() { diff --git a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt index b25869d07..4f9c4b6f6 100644 --- a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt +++ b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt @@ -76,7 +76,7 @@ class OfflineTts( text: String, sid: Int = 0, speed: Float = 1.0f, - callback: (samples: FloatArray) -> Unit + callback: (samples: FloatArray) -> Int ): GeneratedAudio { val objArray = generateWithCallbackImpl( ptr, @@ -146,7 +146,7 @@ class OfflineTts( text: String, sid: Int = 0, speed: Float = 1.0f, - callback: (samples: FloatArray) -> Unit + callback: (samples: FloatArray) -> Int ): Array companion object { diff --git a/android/SherpaOnnxTts/app/src/main/res/layout/activity_main.xml b/android/SherpaOnnxTts/app/src/main/res/layout/activity_main.xml index 3547de872..c66022a8c 100644 --- a/android/SherpaOnnxTts/app/src/main/res/layout/activity_main.xml +++ b/android/SherpaOnnxTts/app/src/main/res/layout/activity_main.xml @@ -84,4 +84,16 @@ app:layout_constraintLeft_toLeftOf="parent" app:layout_constraintRight_toRightOf="parent" app:layout_constraintTop_toBottomOf="@id/generate" /> + + + + +
+
+ + + +
+
+ + + + + diff --git a/wasm/vad/sherpa-onnx-vad.js b/wasm/vad/sherpa-onnx-vad.js new file mode 100644 index 000000000..154bbea0f --- /dev/null +++ b/wasm/vad/sherpa-onnx-vad.js @@ -0,0 +1,253 @@ +function freeConfig(config, Module) { + if ('buffer' in config) { + Module._free(config.buffer); + } + + if ('sileroVad' in config) { + freeConfig(config.sileroVad, Module) + } + + + Module._free(config.ptr); +} + +// The user should free the returned pointers +function initSherpaOnnxSileroVadModelConfig(config, Module) { + const modelLen = Module.lengthBytesUTF8(config.model || '') + 1; + + const n = modelLen; + + const buffer = Module._malloc(n); + + const len = 5 * 4; + const ptr = Module._malloc(len); + + Module.stringToUTF8(config.model || '', buffer, modelLen); + + offset = 0; + Module.setValue(ptr, buffer, 'i8*'); + offset += 4; + + Module.setValue(ptr + offset, config.threshold || 0.5, 'float'); + offset += 4; + + Module.setValue(ptr + offset, config.minSilenceDuration || 0.5, 'float'); + offset += 4; + + Module.setValue(ptr + offset, config.minSpeechDuration || 0.25, 'float'); + offset += 4; + + Module.setValue(ptr + offset, config.windowSize || 512, 'i32'); + offset += 4; + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + +function initSherpaOnnxVadModelConfig(config, Module) { + if (!('sileroVad' in config)) { + config.sileroVad = { + model: '', + threshold: 0.50, + minSilenceDuration: 0.50, + minSpeechDuration: 0.25, + windowSize: 512, + }; + } + + const sileroVad = + initSherpaOnnxSileroVadModelConfig(config.sileroVad, Module); + + const len = sileroVad.len + 4 * 4; + const ptr = Module._malloc(len); + + const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; + const buffer = Module._malloc(providerLen); + Module.stringToUTF8(config.provider || 'cpu', buffer, providerLen); + + let offset = 0; + Module._CopyHeap(sileroVad.ptr, sileroVad.len, ptr + offset); + offset += sileroVad.len; + + Module.setValue(ptr + offset, config.sampleRate 
|| 16000, 'i32'); + offset += 4; + + Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); + offset += 4; + + Module.setValue(ptr + offset, buffer, 'i8*'); // provider + offset += 4; + + Module.setValue(ptr + offset, config.debug || 0, 'i32'); + offset += 4; + + return { + buffer: buffer, ptr: ptr, len: len, sileroVad: sileroVad, + } +} + +function createVad(Module, myConfig) { + const sileroVad = { + model: './silero_vad.onnx', + threshold: 0.50, + minSilenceDuration: 0.50, + minSpeechDuration: 0.25, + windowSize: 512, + }; + + let config = { + sileroVad: sileroVad, + sampleRate: 16000, + numThreads: 1, + provider: 'cpu', + debug: 1, + bufferSizeInSeconds: 30, + }; + + if (myConfig) { + config = myConfig; + } + + return new Vad(config, Module); +} + + +class CircularBuffer { + constructor(capacity, Module) { + this.handle = Module._SherpaOnnxCreateCircularBuffer(capacity); + this.Module = Module; + } + + free() { + this.Module._SherpaOnnxDestroyCircularBuffer(this.handle); + this.handle = 0 + } + + /** + * @param samples {Float32Array} + */ + push(samples) { + const pointer = + this.Module._malloc(samples.length * samples.BYTES_PER_ELEMENT); + this.Module.HEAPF32.set(samples, pointer / samples.BYTES_PER_ELEMENT); + this.Module._SherpaOnnxCircularBufferPush( + this.handle, pointer, samples.length); + this.Module._free(pointer); + } + + get(startIndex, n) { + const p = + this.Module._SherpaOnnxCircularBufferGet(this.handle, startIndex, n); + + const samplesPtr = p / 4; + const samples = new Float32Array(n); + for (let i = 0; i < n; i++) { + samples[i] = this.Module.HEAPF32[samplesPtr + i]; + } + + this.Module._SherpaOnnxCircularBufferFree(p); + + return samples; + } + + pop(n) { + this.Module._SherpaOnnxCircularBufferPop(this.handle, n); + } + + size() { + return this.Module._SherpaOnnxCircularBufferSize(this.handle); + } + + head() { + return this.Module._SherpaOnnxCircularBufferHead(this.handle); + } + + reset() { + 
this.Module._SherpaOnnxCircularBufferReset(this.handle); + } +} + +class Vad { + constructor(configObj, Module) { + this.config = configObj; + const config = initSherpaOnnxVadModelConfig(configObj, Module); + Module._MyPrint(config.ptr); + const handle = Module._SherpaOnnxCreateVoiceActivityDetector( + config.ptr, configObj.bufferSizeInSeconds || 30); + freeConfig(config, Module); + + this.handle = handle; + this.Module = Module; + } + + free() { + this.Module._SherpaOnnxDestroyVoiceActivityDetector(this.handle); + this.handle = 0 + } + + // samples is a float32 array + acceptWaveform(samples) { + const pointer = + this.Module._malloc(samples.length * samples.BYTES_PER_ELEMENT); + this.Module.HEAPF32.set(samples, pointer / samples.BYTES_PER_ELEMENT); + this.Module._SherpaOnnxVoiceActivityDetectorAcceptWaveform( + this.handle, pointer, samples.length); + this.Module._free(pointer); + } + + isEmpty() { + return this.Module._SherpaOnnxVoiceActivityDetectorEmpty(this.handle) == 1; + } + + isDetected() { + return this.Module._SherpaOnnxVoiceActivityDetectorDetected(this.handle) == + 1; + } + + pop() { + this.Module._SherpaOnnxVoiceActivityDetectorPop(this.handle); + } + + clear() { + this.Module._SherpaOnnxVoiceActivityDetectorClear(this.handle); + } + + /* +{ + samples: a 1-d float32 array, + start: an int32 +} + */ + front() { + const h = this.Module._SherpaOnnxVoiceActivityDetectorFront(this.handle); + + const start = this.Module.HEAP32[h / 4]; + const samplesPtr = this.Module.HEAP32[h / 4 + 1] / 4; + const numSamples = this.Module.HEAP32[h / 4 + 2]; + + const samples = new Float32Array(numSamples); + for (let i = 0; i < numSamples; i++) { + samples[i] = this.Module.HEAPF32[samplesPtr + i]; + } + + this.Module._SherpaOnnxDestroySpeechSegment(h); + return {samples: samples, start: start}; + } + + reset() { + this.Module._SherpaOnnxVoiceActivityDetectorReset(this.handle); + } + + flush() { + this.Module._SherpaOnnxVoiceActivityDetectorFlush(this.handle); + } +}; + +if 
(typeof process == 'object' && typeof process.versions == 'object' && + typeof process.versions.node == 'string') { + module.exports = { + createVad, + CircularBuffer, + }; +} diff --git a/wasm/vad/sherpa-onnx-wasm-main-vad.cc b/wasm/vad/sherpa-onnx-wasm-main-vad.cc new file mode 100644 index 000000000..3c1600ba1 --- /dev/null +++ b/wasm/vad/sherpa-onnx-wasm-main-vad.cc @@ -0,0 +1,45 @@ +// wasm/sherpa-onnx-wasm-main-vad.cc +// +// Copyright (c) 2024 Xiaomi Corporation +#include + +#include +#include + +#include "sherpa-onnx/c-api/c-api.h" + +// see also +// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.html + +extern "C" { + +static_assert(sizeof(SherpaOnnxSileroVadModelConfig) == 5 * 4, ""); + +static_assert(sizeof(SherpaOnnxVadModelConfig) == + sizeof(SherpaOnnxSileroVadModelConfig) + 4 * 4, + ""); +void MyPrint(SherpaOnnxVadModelConfig *config) { + auto silero_vad = &config->silero_vad; + + fprintf(stdout, "----------silero_vad config----------\n"); + fprintf(stdout, "model: %s\n", silero_vad->model); + fprintf(stdout, "threshold: %.3f\n", silero_vad->threshold); + fprintf(stdout, "min_silence_duration: %.3f\n", + silero_vad->min_silence_duration); + fprintf(stdout, "min_speech_duration: %.3f\n", + silero_vad->min_speech_duration); + fprintf(stdout, "window_size: %d\n", silero_vad->window_size); + + fprintf(stdout, "----------config----------\n"); + + fprintf(stdout, "sample_rate: %d\n", config->sample_rate); + fprintf(stdout, "num_threads: %d\n", config->num_threads); + + fprintf(stdout, "provider: %s\n", config->provider); + fprintf(stdout, "debug: %d\n", config->debug); +} + +void CopyHeap(const char *src, int32_t num_bytes, char *dst) { + std::copy(src, src + num_bytes, dst); +} +} From 537e163dd012aec3b250af77697908d37759c057 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 24 Aug 2024 13:24:52 +0800 Subject: [PATCH 164/237] WebAssembly example for VAD + Non-streaming ASR (#1284) --- 
.../workflows/wasm-simd-hf-space-de-tts.yaml | 4 + .../wasm-simd-hf-space-en-asr-zipformer.yaml | 3 + .../workflows/wasm-simd-hf-space-en-tts.yaml | 4 + .../wasm-simd-hf-space-silero-vad.yaml | 1 + .../workflows/wasm-simd-hf-space-vad-asr.yaml | 93 +++++ ...-space-zh-cantonese-en-asr-paraformer.yaml | 4 + ...sm-simd-hf-space-zh-en-asr-paraformer.yaml | 4 + ...asm-simd-hf-space-zh-en-asr-zipformer.yaml | 4 + CMakeLists.txt | 15 +- README.md | 239 ++++++++--- build-wasm-simd-vad-asr.sh | 68 +++ scripts/wasm/generate-vad-asr.py | 229 +++++++++++ scripts/wasm/run-vad-asr.sh.in | 92 +++++ sherpa-onnx/c-api/c-api.cc | 5 + sherpa-onnx/c-api/c-api.h | 3 + wasm/CMakeLists.txt | 4 + wasm/asr/assets/README.md | 7 + wasm/asr/index.html | 2 +- wasm/tts/assets/README.md | 5 + wasm/vad-asr/CMakeLists.txt | 83 ++++ wasm/vad-asr/app-vad-asr.js | 389 ++++++++++++++++++ wasm/vad-asr/assets/README.md | 23 ++ wasm/vad-asr/index.html | 43 ++ wasm/vad-asr/sherpa-onnx-asr.js | 1 + wasm/vad-asr/sherpa-onnx-vad.js | 1 + wasm/vad-asr/sherpa-onnx-wasm-main-vad-asr.cc | 19 + wasm/vad/assets/README.md | 3 + wasm/vad/index.html | 2 +- wasm/vad/sherpa-onnx-vad.js | 1 - 29 files changed, 1281 insertions(+), 70 deletions(-) create mode 100644 .github/workflows/wasm-simd-hf-space-vad-asr.yaml create mode 100755 build-wasm-simd-vad-asr.sh create mode 100755 scripts/wasm/generate-vad-asr.py create mode 100644 scripts/wasm/run-vad-asr.sh.in create mode 100644 wasm/vad-asr/CMakeLists.txt create mode 100644 wasm/vad-asr/app-vad-asr.js create mode 100644 wasm/vad-asr/assets/README.md create mode 100644 wasm/vad-asr/index.html create mode 120000 wasm/vad-asr/sherpa-onnx-asr.js create mode 120000 wasm/vad-asr/sherpa-onnx-vad.js create mode 100644 wasm/vad-asr/sherpa-onnx-wasm-main-vad-asr.cc diff --git a/.github/workflows/wasm-simd-hf-space-de-tts.yaml b/.github/workflows/wasm-simd-hf-space-de-tts.yaml index f51535379..cbd3b1fce 100644 --- a/.github/workflows/wasm-simd-hf-space-de-tts.yaml +++ 
b/.github/workflows/wasm-simd-hf-space-de-tts.yaml @@ -25,8 +25,12 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash diff --git a/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml b/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml index 975266917..510a003c7 100644 --- a/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml +++ b/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml @@ -27,6 +27,9 @@ jobs: fetch-depth: 0 - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash diff --git a/.github/workflows/wasm-simd-hf-space-en-tts.yaml b/.github/workflows/wasm-simd-hf-space-en-tts.yaml index f5f950c3c..9c5c1d446 100644 --- a/.github/workflows/wasm-simd-hf-space-en-tts.yaml +++ b/.github/workflows/wasm-simd-hf-space-en-tts.yaml @@ -25,8 +25,12 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash diff --git a/.github/workflows/wasm-simd-hf-space-silero-vad.yaml b/.github/workflows/wasm-simd-hf-space-silero-vad.yaml index e384af3fb..dc8bada70 100644 --- a/.github/workflows/wasm-simd-hf-space-silero-vad.yaml +++ b/.github/workflows/wasm-simd-hf-space-silero-vad.yaml @@ -25,6 +25,7 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 with: diff --git a/.github/workflows/wasm-simd-hf-space-vad-asr.yaml b/.github/workflows/wasm-simd-hf-space-vad-asr.yaml new file mode 100644 index 000000000..726b69826 --- /dev/null +++ b/.github/workflows/wasm-simd-hf-space-vad-asr.yaml @@ -0,0 +1,93 @@ +name: wasm-simd-hf-space-vad-asr + +on: + push: + branches: + - 
wasm + tags: + - 'v[0-9]+.[0-9]+.[0-9]+*' + + workflow_dispatch: + +concurrency: + group: wasm-simd-hf-space-vad-asr${{ github.ref }} + cancel-in-progress: true + +jobs: + wasm-simd-hf-space-vad-asr: + name: ${{ matrix.index }}/${{ matrix.total }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + total: ["8"] + index: ["0", "1", "2", "3", "4", "5", "6", "7"] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install Python dependencies + shell: bash + run: | + python3 -m pip install --upgrade pip jinja2 + + - name: Install emsdk + uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' + + - name: View emsdk version + shell: bash + run: | + emcc -v + echo "--------------------" + emcc --check + + - name: Generate build script + shell: bash + run: | + cd scripts/wasm + + total=${{ matrix.total }} + index=${{ matrix.index }} + + ./generate-vad-asr.py --total $total --index $index + + chmod +x run-vad-asr.sh + mv -v ./run-vad-asr.sh ../.. 
+ + - name: Show build scripts + shell: bash + run: | + cat ./run-vad-asr.sh + + - uses: actions/upload-artifact@v4 + with: + name: run-vad-asr-${{ matrix.index }} + path: ./run-vad-asr.sh + + - name: Build sherpa-onnx for WebAssembly + shell: bash + env: + MS_TOKEN: ${{ secrets.MODEL_SCOPE_GIT_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + ./run-vad-asr.sh + + - name: Release jar + if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/') + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + overwrite: true + file: ./*.tar.bz2 + + - name: Upload wasm files + uses: actions/upload-artifact@v4 + with: + name: sherpa-onnx-wasm-simd-vad-asr-${{ matrix.index }} + path: ./sherpa-onnx-wasm-simd-*.tar.bz2 diff --git a/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml b/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml index e0c665737..c72e0cef2 100644 --- a/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml +++ b/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml @@ -25,8 +25,12 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash diff --git a/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml b/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml index 500305420..b76f912b4 100644 --- a/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml +++ b/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml @@ -25,8 +25,12 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash diff --git 
a/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml b/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml index dfa0e1614..9bdd90ee2 100644 --- a/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml +++ b/.github/workflows/wasm-simd-hf-space-zh-en-asr-zipformer.yaml @@ -25,8 +25,12 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + - name: Install emsdk uses: mymindstorm/setup-emsdk@v14 + with: + version: 3.1.51 + actions-cache-folder: 'emsdk-cache' - name: View emsdk version shell: bash diff --git a/CMakeLists.txt b/CMakeLists.txt index 7408f8d69..b71bb133d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,7 @@ option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF) option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF) option(SHERPA_ONNX_ENABLE_WASM_KWS "Whether to enable WASM for KWS" OFF) option(SHERPA_ONNX_ENABLE_WASM_VAD "Whether to enable WASM for VAD" OFF) +option(SHERPA_ONNX_ENABLE_WASM_VAD_ASR "Whether to enable WASM for VAD+ASR" OFF) option(SHERPA_ONNX_ENABLE_WASM_NODEJS "Whether to enable WASM for NodeJS" OFF) option(SHERPA_ONNX_ENABLE_BINARY "Whether to build binaries" ON) option(SHERPA_ONNX_ENABLE_TTS "Whether to build TTS related code" ON) @@ -137,6 +138,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_KWS ${SHERPA_ONNX_ENABLE_WASM_KWS}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_VAD ${SHERPA_ONNX_ENABLE_WASM_VAD}") +message(STATUS "SHERPA_ONNX_ENABLE_WASM_VAD_ASR ${SHERPA_ONNX_ENABLE_WASM_VAD_ASR}") message(STATUS "SHERPA_ONNX_ENABLE_WASM_NODEJS ${SHERPA_ONNX_ENABLE_WASM_NODEJS}") message(STATUS "SHERPA_ONNX_ENABLE_BINARY ${SHERPA_ONNX_ENABLE_BINARY}") message(STATUS "SHERPA_ONNX_ENABLE_TTS ${SHERPA_ONNX_ENABLE_TTS}") @@ -211,11 +213,22 @@ if(SHERPA_ONNX_ENABLE_WASM) endif() if(SHERPA_ONNX_ENABLE_WASM_KWS) + if(NOT 
SHERPA_ONNX_ENABLE_WASM) + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for KWS") + endif() add_definitions(-DSHERPA_ONNX_ENABLE_WASM_KWS=1) endif() if(SHERPA_ONNX_ENABLE_WASM_VAD) - add_definitions(-DSHERPA_ONNX_ENABLE_WASM_VAD=1) + if(NOT SHERPA_ONNX_ENABLE_WASM) + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for VAD") + endif() +endif() + +if(SHERPA_ONNX_ENABLE_WASM_VAD_ASR) + if(NOT SHERPA_ONNX_ENABLE_WASM) + message(FATAL_ERROR "Please set SHERPA_ONNX_ENABLE_WASM to ON if you enable WASM for VAD+ASR") + endif() endif() if(NOT CMAKE_CXX_STANDARD) diff --git a/README.md b/README.md index dcdaec2f2..cc9acb2b1 100644 --- a/README.md +++ b/README.md @@ -14,13 +14,13 @@ ### Supported platforms -|Architecture| Android | iOS | Windows | macOS | linux | -|------------|------------------|---------------|------------|-------|-------| -| x64 | ✔️ | | ✔️ | ✔️ | ✔️ | -| x86 | ✔️ | | ✔️ | | | -| arm64 | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | -| arm32 | ✔️ | | | | ✔️ | -| riscv64 | | | | | ✔️ | +|Architecture| Android | iOS | Windows | macOS | linux | +|------------|---------|---------|------------|-------|-------| +| x64 | ✔️ | | ✔️ | ✔️ | ✔️ | +| x86 | ✔️ | | ✔️ | | | +| arm64 | ✔️ | ✔️ | ✔️ | ✔️ | ✔️ | +| arm32 | ✔️ | | | | ✔️ | +| riscv64 | | | | | ✔️ | ### Supported programming languages @@ -37,7 +37,7 @@ |-------|----------|----------|------------| | ✔️ | ✔️ | ✔️ | ✔️ | -For Rust support, please see https://github.com/thewh1teagle/sherpa-rs +For Rust support, please see [sherpa-rs][sherpa-rs] It also supports WebAssembly. 
@@ -51,7 +51,7 @@ This repository supports running the following functions **locally** - Speaker verification - Spoken language identification - Audio tagging - - VAD (e.g., [silero-vad](https://github.com/snakers4/silero-vad)) + - VAD (e.g., [silero-vad][silero-vad]) - Keyword spotting on the following platforms and operating systems: @@ -62,11 +62,12 @@ on the following platforms and operating systems: - iOS - NodeJS - WebAssembly - - [Raspberry Pi](https://www.raspberrypi.com/) - - [RV1126](https://www.rock-chips.com/uploads/pdf/2022.8.26/191/RV1126%20Brief%20Datasheet.pdf) - - [LicheePi4A](https://sipeed.com/licheepi4a) - - [VisionFive 2](https://www.starfivetech.com/en/site/boards) - - [旭日X3派](https://developer.horizon.ai/api/v1/fileData/documents_pi/index.html) + - [Raspberry Pi][Raspberry Pi] + - [RV1126][RV1126] + - [LicheePi4A][LicheePi4A] + - [VisionFive 2][VisionFive 2] + - [旭日X3派][旭日X3派] + - [爱芯派][爱芯派] - etc with the following APIs @@ -81,59 +82,68 @@ with the following APIs You can visit the following Huggingface spaces to try `sherpa-onnx` without installing anything. All you need is a browser. 
-| Description | URL | -|---|---| -| Speech recognition | [Click me](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition)| -| Speech recognition with [Whisper](https://github.com/openai/whisper)| [Click me](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition-with-whisper)| -| Speech synthesis | [Click me](https://huggingface.co/spaces/k2-fsa/text-to-speech)| -| Generate subtitles| [Click me](https://huggingface.co/spaces/k2-fsa/generate-subtitles-for-videos)| -|Audio tagging| [Click me](https://huggingface.co/spaces/k2-fsa/audio-tagging)| -|Spoken language identification with [Whisper](https://github.com/openai/whisper)|[Click me](https://huggingface.co/spaces/k2-fsa/spoken-language-identification)| +| Description | URL | +|-------------------------------------------------------|------------------------------------| +| Speech recognition | [Click me][hf-space-asr] | +| Speech recognition with [Whisper][Whisper] | [Click me][hf-space-asr-whisper] | +| Speech synthesis | [Click me][hf-space-tts] | +| Generate subtitles | [Click me][hf-space-subtitle] | +| Audio tagging | [Click me][hf-space-audio-tagging] | +| Spoken language identification with [Whisper][Whisper]| [Click me][hf-space-slid-whisper] | We also have spaces built using WebAssembly. 
The are listed below: -| Description | URL| Chinese users| -|---|---|---| -|Voice activity detection with [silero-vad](https://github.com/snakers4/silero-vad)| [Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx)|[地址](https://modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx)| -|Real-time speech recognition (Chinese + English) with Zipformer | [Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en)|[地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en)| -|Real-time speech recognition (Chinese + English) with Paraformer|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer)| -|Real-time speech recognition (Chinese + English + Cantonese) with Paraformer|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer)| -|Real-time speech recognition (English) |[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en)|[地址](https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en)| -|Speech synthesis (English) |[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en)| -|Speech synthesis (German)|[Click me](https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de)| [地址](https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de)| +| Description | Huggingface space| ModelScope space| +|------------------------------------------------------------------------------------------|------------------|-----------------| +|Voice activity detection with [silero-vad][silero-vad] | [Click me][wasm-hf-vad]|[地址][wasm-ms-vad]| +|Real-time speech recognition (Chinese + English) with 
Zipformer | [Click me][wasm-hf-streaming-asr-zh-en-zipformer]|[地址][wasm-ms-streaming-asr-zh-en-zipformer]| +|Real-time speech recognition (Chinese + English) with Paraformer |[Click me][wasm-hf-streaming-asr-zh-en-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-paraformer]| +|Real-time speech recognition (Chinese + English + Cantonese) with [Paraformer-large][Paraformer-large]|[Click me][wasm-hf-streaming-asr-zh-en-yue-paraformer]| [地址][wasm-ms-streaming-asr-zh-en-yue-paraformer]| +|Real-time speech recognition (English) |[Click me][wasm-hf-streaming-asr-en-zipformer] |[地址][wasm-ms-streaming-asr-en-zipformer]| +|VAD + speech recognition (Chinese + English + Korean + Japanese + Cantonese) with [SenseVoice][SenseVoice]|[Click me][wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]| [地址][wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]| +|VAD + speech recognition (English) with [Whisper][Whisper] tiny.en|[Click me][wasm-hf-vad-asr-en-whisper-tiny-en]| [地址][wasm-ms-vad-asr-en-whisper-tiny-en]| +|VAD + speech recognition (English) with Zipformer trained with [GigaSpeech][GigaSpeech] |[Click me][wasm-hf-vad-asr-en-zipformer-gigaspeech]| [地址][wasm-ms-vad-asr-en-zipformer-gigaspeech]| +|VAD + speech recognition (Chinese) with Zipformer trained with [WenetSpeech][WenetSpeech] |[Click me][wasm-hf-vad-asr-zh-zipformer-wenetspeech]| [地址][wasm-ms-vad-asr-zh-zipformer-wenetspeech]| +|VAD + speech recognition (Japanese) with Zipformer trained with [ReazonSpeech][ReazonSpeech]|[Click me][wasm-hf-vad-asr-ja-zipformer-reazonspeech]| [地址][wasm-ms-vad-asr-ja-zipformer-reazonspeech]| +|VAD + speech recognition (Thai) with Zipformer trained with [GigaSpeech2][GigaSpeech2] |[Click me][wasm-hf-vad-asr-th-zipformer-gigaspeech2]| [地址][wasm-ms-vad-asr-th-zipformer-gigaspeech2]| +|VAD + speech recognition (Chinese 多种方言) with a [TeleSpeech-ASR][TeleSpeech-ASR] CTC model|[Click me][wasm-hf-vad-asr-zh-telespeech]| [地址][wasm-ms-vad-asr-zh-telespeech]| +|VAD + speech recognition (English + Chinese, 及多种中文方言)
with Paraformer-large |[Click me][wasm-hf-vad-asr-zh-en-paraformer-large]| [地址][wasm-ms-vad-asr-zh-en-paraformer-large]| +|VAD + speech recognition (English + Chinese, 及多种中文方言) with Paraformer-small |[Click me][wasm-hf-vad-asr-zh-en-paraformer-small]| [地址][wasm-ms-vad-asr-zh-en-paraformer-small]| +|Speech synthesis (English) |[Click me][wasm-hf-tts-piper-en]| [地址][wasm-ms-tts-piper-en]| +|Speech synthesis (German) |[Click me][wasm-hf-tts-piper-de]| [地址][wasm-ms-tts-piper-de]| ### Links for pre-built Android APKs -| Description | URL | 中国用户 | -|--------------------------------|-----------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| -| Streaming speech recognition | [Address](https://k2-fsa.github.io/sherpa/onnx/android/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html) | -| Text-to-speech | [Address](https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html) | -|Voice activity detection (VAD) | [Address](https://k2-fsa.github.io/sherpa/onnx/vad/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/vad/apk-cn.html)| -|VAD + non-streaming speech recognition| [Address](https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr-cn.html)| -|Two-pass speech recognition| [Address](https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass-cn.html)| -| Audio tagging | [Address](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-cn.html) | -| Audio tagging (WearOS) | [Address](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos-cn.html) | -| Speaker identification | 
[Address](https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk-cn.html) | -| Spoken language identification | [Address](https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk-cn.html) | -|Keyword spotting| [Address](https://k2-fsa.github.io/sherpa/onnx/kws/apk.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/kws/apk-cn.html)| +| Description | URL | 中国用户 | +|----------------------------------------|------------------------------|-----------------------------| +| Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn]| +| Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] | +| Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] | +| VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] | +| Two-pass speech recognition | [Address][apk-2pass] | [点此][apk-2pass-cn] | +| Audio tagging | [Address][apk-at] | [点此][apk-at-cn] | +| Audio tagging (WearOS) | [Address][apk-at-wearos] | [点此][apk-at-wearos-cn] | +| Speaker identification | [Address][apk-sid] | [点此][apk-sid-cn] | +| Spoken language identification | [Address][apk-slid] | [点此][apk-slid-cn] | +| Keyword spotting | [Address][apk-kws] | [点此][apk-kws-cn] | ### Links for pre-built Flutter APPs #### Real-time speech recognition -| Description | URL | 中国用户 | -|--------------------------------|---------------------------------------------------------------------|---------------------------------------------------------------------| -| Streaming speech recognition | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app-cn.html)| +| Description | URL | 中国用户 | +|--------------------------------|-------------------------------------|-------------------------------------| +| Streaming 
speech recognition | [Address][apk-flutter-streaming-asr]| [点此][apk-flutter-streaming-asr-cn]| #### Text-to-speech -| Description | URL | 中国用户 | -|--------------------------------|--------------------------------------------------------------|-----------------------------------------------------------------------------| -| Android (arm64-v8a, armeabi-v7a, x86_64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android-cn.html)| -| Linux (x64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux-cn.html) | -| macOS (x64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64-cn.html) | -| macOS (arm64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64-cn.html)| -| Windows (x64) | [Address](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win.html) | [点此](https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win-cn.html) | +| Description | URL | 中国用户 | +|------------------------------------------|------------------------------------|------------------------------------| +| Android (arm64-v8a, armeabi-v7a, x86_64) | [Address][flutter-tts-android] | [点此][flutter-tts-android-cn] | +| Linux (x64) | [Address][flutter-tts-linux] | [点此][flutter-tts-linux-cn] | +| macOS (x64) | [Address][flutter-tts-macos-x64] | [点此][flutter-tts-macos-arm64-cn] | +| macOS (arm64) | [Address][flutter-tts-macos-arm64] | [点此][flutter-tts-macos-x64-cn] | +| Windows (x64) | [Address][flutter-tts-win-x64] | [点此][flutter-tts-win-x64-cn] | > Note: You need to build from source for iOS. @@ -141,23 +151,23 @@ We also have spaces built using WebAssembly. 
The are listed below: #### Generating subtitles -| Description | URL | 中国用户 | -|--------------------------------|---------------------------------------------------------------------|---------------------------------------------------------------------| -| Generate subtitles (生成字幕) | [Address](https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles.html)| [点此](https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles-cn.html)| +| Description | URL | 中国用户 | +|--------------------------------|----------------------------|----------------------------| +| Generate subtitles (生成字幕) | [Address][lazarus-subtitle]| [点此][lazarus-subtitle-cn]| ### Links for pre-trained models -| Description | URL | -|--------------------------------|--------------------------------------------------------------------------------------------------------------------------------| -| Speech recognition (speech to text, ASR) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) | -| Text-to-speech (TTS) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models) | -| VAD | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx)| -| Keyword spotting |[Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models)| -| Audio tagging | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models)| -| Speaker identification (Speaker ID) | [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models)| -| Spoken language identification (Language ID) | See multi-lingual [Whisper](https://github.com/openai/whisper) ASR models from [Speech recognition](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) | -| Punctuation| [Address](https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models)| +| Description | URL | 
+|---------------------------------------------|---------------------------------------------------------------------------------------| +| Speech recognition (speech to text, ASR) | [Address][asr-models] | +| Text-to-speech (TTS) | [Address][tts-models] | +| VAD | [Address][vad-models] | +| Keyword spotting | [Address][kws-models] | +| Audio tagging | [Address][at-models] | +| Speaker identification (Speaker ID) | [Address][sid-models] | +| Spoken language identification (Language ID)| See multi-lingual [Whisper][Whisper] ASR models from [Speech recognition][asr-models]| +| Punctuation | [Address][punct-models] | ### Useful links @@ -169,3 +179,100 @@ We also have spaces built using WebAssembly. The are listed below: Please see https://k2-fsa.github.io/sherpa/social-groups.html for 新一代 Kaldi **微信交流群** and **QQ 交流群**. + +[sherpa-rs]: https://github.com/thewh1teagle/sherpa-rs +[silero-vad]: https://github.com/snakers4/silero-vad +[Raspberry Pi]: https://www.raspberrypi.com/ +[RV1126]: https://www.rock-chips.com/uploads/pdf/2022.8.26/191/RV1126%20Brief%20Datasheet.pdf +[LicheePi4A]: https://sipeed.com/licheepi4a +[VisionFive 2]: https://www.starfivetech.com/en/site/boards +[旭日X3派]: https://developer.horizon.ai/api/v1/fileData/documents_pi/index.html +[爱芯派]: https://wiki.sipeed.com/hardware/zh/maixIII/ax-pi/axpi.html +[hf-space-asr]: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition +[Whisper]: https://github.com/openai/whisper +[hf-space-asr-whisper]: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition-with-whisper +[hf-space-tts]: https://huggingface.co/spaces/k2-fsa/text-to-speech +[hf-space-subtitle]: https://huggingface.co/spaces/k2-fsa/generate-subtitles-for-videos +[hf-space-audio-tagging]: https://huggingface.co/spaces/k2-fsa/audio-tagging +[hf-space-slid-whisper]: https://huggingface.co/spaces/k2-fsa/spoken-language-identification +[wasm-hf-vad]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-sherpa-onnx +[wasm-ms-vad]: 
https://modelscope.cn/studios/csukuangfj/web-assembly-vad-sherpa-onnx +[wasm-hf-streaming-asr-zh-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en +[wasm-ms-streaming-asr-zh-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en +[wasm-hf-streaming-asr-zh-en-paraformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer +[wasm-ms-streaming-asr-zh-en-paraformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-en-paraformer +[Paraformer-large]: https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary +[wasm-hf-streaming-asr-zh-en-yue-paraformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer +[wasm-ms-streaming-asr-zh-en-yue-paraformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-zh-cantonese-en-paraformer +[wasm-hf-streaming-asr-en-zipformer]: https://huggingface.co/spaces/k2-fsa/web-assembly-asr-sherpa-onnx-en +[wasm-ms-streaming-asr-en-zipformer]: https://modelscope.cn/studios/k2-fsa/web-assembly-asr-sherpa-onnx-en +[SenseVoice]: https://github.com/FunAudioLLM/SenseVoice +[wasm-hf-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice +[wasm-ms-vad-asr-zh-en-ko-ja-yue-sense-voice]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice +[wasm-hf-vad-asr-en-whisper-tiny-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny +[wasm-ms-vad-asr-en-whisper-tiny-en]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny +[wasm-hf-vad-asr-en-zipformer-gigaspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech +[wasm-ms-vad-asr-en-zipformer-gigaspeech]: 
https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech +[wasm-hf-vad-asr-zh-zipformer-wenetspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech +[wasm-ms-vad-asr-zh-zipformer-wenetspeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech +[ReazonSpeech]: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf +[wasm-hf-vad-asr-ja-zipformer-reazonspeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer +[wasm-ms-vad-asr-ja-zipformer-reazonspeech]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer +[GigaSpeech2]: https://github.com/SpeechColab/GigaSpeech2 +[wasm-hf-vad-asr-th-zipformer-gigaspeech2]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer +[wasm-ms-vad-asr-th-zipformer-gigaspeech2]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer +[TeleSpeech-ASR]: https://github.com/Tele-AI/TeleSpeech-ASR +[wasm-hf-vad-asr-zh-telespeech]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech +[wasm-ms-vad-asr-zh-telespeech]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech +[wasm-hf-vad-asr-zh-en-paraformer-large]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer +[wasm-ms-vad-asr-zh-en-paraformer-large]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer +[wasm-hf-vad-asr-zh-en-paraformer-small]: https://huggingface.co/spaces/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small +[wasm-ms-vad-asr-zh-en-paraformer-small]: https://www.modelscope.cn/studios/k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small +[wasm-hf-tts-piper-en]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-en 
+[wasm-ms-tts-piper-en]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-en +[wasm-hf-tts-piper-de]: https://huggingface.co/spaces/k2-fsa/web-assembly-tts-sherpa-onnx-de +[wasm-ms-tts-piper-de]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de +[apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html +[apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html +[apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html +[apk-tts-cn]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine-cn.html +[apk-vad]: https://k2-fsa.github.io/sherpa/onnx/vad/apk.html +[apk-vad-cn]: https://k2-fsa.github.io/sherpa/onnx/vad/apk-cn.html +[apk-vad-asr]: https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr.html +[apk-vad-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/vad/apk-asr-cn.html +[apk-2pass]: https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass.html +[apk-2pass-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-2pass-cn.html +[apk-at]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk.html +[apk-at-cn]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-cn.html +[apk-at-wearos]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos.html +[apk-at-wearos-cn]: https://k2-fsa.github.io/sherpa/onnx/audio-tagging/apk-wearos-cn.html +[apk-sid]: https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk.html +[apk-sid-cn]: https://k2-fsa.github.io/sherpa/onnx/speaker-identification/apk-cn.html +[apk-slid]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk.html +[apk-slid-cn]: https://k2-fsa.github.io/sherpa/onnx/spoken-language-identification/apk-cn.html +[apk-kws]: https://k2-fsa.github.io/sherpa/onnx/kws/apk.html +[apk-kws-cn]: https://k2-fsa.github.io/sherpa/onnx/kws/apk-cn.html +[apk-flutter-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app.html +[apk-flutter-streaming-asr-cn]: 
https://k2-fsa.github.io/sherpa/onnx/flutter/asr/app-cn.html +[flutter-tts-android]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android.html +[flutter-tts-android-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-android-cn.html +[flutter-tts-linux]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux.html +[flutter-tts-linux-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-linux-cn.html +[flutter-tts-macos-x64]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64.html +[flutter-tts-macos-arm64-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-x64-cn.html +[flutter-tts-macos-arm64]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64.html +[flutter-tts-macos-x64-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-macos-arm64-cn.html +[flutter-tts-win-x64]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win.html +[flutter-tts-win-x64-cn]: https://k2-fsa.github.io/sherpa/onnx/flutter/tts-win-cn.html +[lazarus-subtitle]: https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles.html +[lazarus-subtitle-cn]: https://k2-fsa.github.io/sherpa/onnx/lazarus/download-generated-subtitles-cn.html +[asr-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +[tts-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +[vad-models]: https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +[kws-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/kws-models +[at-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models +[sid-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models +[slid-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models +[punct-models]: https://github.com/k2-fsa/sherpa-onnx/releases/tag/punctuation-models +[GigaSpeech]: https://github.com/SpeechColab/GigaSpeech +[WenetSpeech]: https://github.com/wenet-e2e/WenetSpeech diff --git 
a/build-wasm-simd-vad-asr.sh b/build-wasm-simd-vad-asr.sh new file mode 100755 index 000000000..5d15cf651 --- /dev/null +++ b/build-wasm-simd-vad-asr.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# Copyright (c) 2024 Xiaomi Corporation +# +# This script is to build sherpa-onnx for WebAssembly (VAD+ASR) +# Note: ASR here means non-streaming ASR + +set -ex + +if [ x"$EMSCRIPTEN" == x"" ]; then + if ! command -v emcc &> /dev/null; then + echo "Please install emscripten first" + echo "" + echo "You can use the following commands to install it:" + echo "" + echo "git clone https://github.com/emscripten-core/emsdk.git" + echo "cd emsdk" + echo "git pull" + echo "./emsdk install latest" + echo "./emsdk activate latest" + echo "source ./emsdk_env.sh" + exit 1 + else + EMSCRIPTEN=$(dirname $(realpath $(which emcc))) + fi +fi + +export EMSCRIPTEN=$EMSCRIPTEN +echo "EMSCRIPTEN: $EMSCRIPTEN" +if [ ! -f $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake ]; then + echo "Cannot find $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" + echo "Please make sure you have installed emsdk correctly" + exit 1 +fi + +mkdir -p build-wasm-simd-vad-asr +pushd build-wasm-simd-vad-asr + +export SHERPA_ONNX_IS_USING_BUILD_WASM_SH=ON + +cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE=$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake \ + \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=OFF \ + -DSHERPA_ONNX_ENABLE_TTS=OFF \ + -DSHERPA_ONNX_ENABLE_C_API=ON \ + -DSHERPA_ONNX_ENABLE_WEBSOCKET=OFF \ + -DSHERPA_ONNX_ENABLE_GPU=OFF \ + -DSHERPA_ONNX_ENABLE_WASM=ON \ + -DSHERPA_ONNX_ENABLE_WASM_VAD_ASR=ON \ + -DSHERPA_ONNX_ENABLE_BINARY=OFF \ + -DSHERPA_ONNX_LINK_LIBSTDCPP_STATICALLY=OFF \ + .. 
+make -j2 +make install + +echo "pwd: $PWD" + +cp -fv ../wasm/vad/sherpa-onnx-vad.js ./install/bin/wasm/vad-asr/ +cp -fv ../wasm/asr/sherpa-onnx-asr.js ./install/bin/wasm/vad-asr/ + +ls -lh install/bin/wasm/vad-asr diff --git a/scripts/wasm/generate-vad-asr.py b/scripts/wasm/generate-vad-asr.py new file mode 100755 index 000000000..4c0099af8 --- /dev/null +++ b/scripts/wasm/generate-vad-asr.py @@ -0,0 +1,229 @@ +#!/usr/bin/env python3 + +import argparse +from dataclasses import dataclass +from typing import List, Optional + +import jinja2 + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--total", + type=int, + default=1, + help="Number of runners", + ) + parser.add_argument( + "--index", + type=int, + default=0, + help="Index of the current runner", + ) + return parser.parse_args() + + +@dataclass +class Model: + model_name: str + hf: str # huggingface space name + ms: str # modelscope space name + short_name: str + cmd: str = "" + + +def get_models(): + models = [ + Model( + model_name="sherpa-onnx-whisper-tiny.en", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny", + ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-en-whisper-tiny", + short_name="vad-asr-en-whisper_tiny", + cmd=""" + pushd $model_name + mv -v tiny.en-encoder.int8.onnx ../whisper-encoder.onnx + mv -v tiny.en-decoder.int8.onnx ../whisper-decoder.onnx + mv -v tiny.en-tokens.txt ../tokens.txt + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Whisper tiny.en supporting English 英文/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-ja-ko-cantonese-sense-voice", + ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-zh-en-jp-ko-cantonese-sense-voice", + short_name="vad-asr-zh_en_ja_ko_cantonese-sense_voice_small", + cmd=""" + pushd $model_name + mv -v model.int8.onnx ../sense-voice.onnx + mv -v tokens.txt ../ + popd + rm -rf $model_name + sed 
-i.bak 's/Zipformer/SenseVoice Small supporting English, Chinese, Japanese, Korean, Cantonese 中英日韩粤/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-paraformer-zh-2023-09-14", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer", + ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer", + short_name="vad-asr-zh_en-paraformer_large", + cmd=""" + pushd $model_name + mv -v model.int8.onnx ../paraformer.onnx + mv -v tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Paraformer supporting Chinese, English 中英/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-paraformer-zh-small-2024-03-09", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small", + ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-en-paraformer-small", + short_name="vad-asr-zh_en-paraformer_small", + cmd=""" + pushd $model_name + mv -v model.int8.onnx ../paraformer.onnx + mv -v tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Paraformer-small supporting Chinese, English 中英文/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-zipformer-gigaspeech-2023-12-12", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech", + ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-en-zipformer-gigaspeech", + short_name="vad-asr-en-zipformer_gigaspeech", + cmd=""" + pushd $model_name + mv encoder-epoch-30-avg-1.int8.onnx ../transducer-encoder.onnx + mv decoder-epoch-30-avg-1.onnx ../transducer-decoder.onnx + mv joiner-epoch-30-avg-1.int8.onnx ../transducer-joiner.onnx + mv tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Zipformer supporting English 英语/g' ../index.html + git diff + """, + ), + Model( + model_name="icefall-asr-zipformer-wenetspeech-20230615", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech", + ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-zipformer-wenetspeech", + short_name="vad-asr-zh-zipformer_wenetspeech", + 
cmd=""" + pushd $model_name + mv -v data/lang_char/tokens.txt ../ + mv -v exp/encoder-epoch-12-avg-4.int8.onnx ../transducer-encoder.onnx + mv -v exp/decoder-epoch-12-avg-4.onnx ../transducer-decoder.onnx + mv -v exp/joiner-epoch-12-avg-4.int8.onnx ../transducer-joiner.onnx + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Zipformer supporting Chinese 中文/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-zipformer-ja-reazonspeech-2024-08-01", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-ja-zipformer", + ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-ja-zipformer", + short_name="vad-asr-ja-zipformer_reazonspeech", + cmd=""" + pushd $model_name + mv encoder-epoch-99-avg-1.int8.onnx ../transducer-encoder.onnx + mv decoder-epoch-99-avg-1.onnx ../transducer-decoder.onnx + mv joiner-epoch-99-avg-1.int8.onnx ../transducer-joiner.onnx + mv tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Zipformer supporting Japanese 日语/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-zipformer-thai-2024-06-20", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-th-zipformer", + ms="csukuangfj/web-assembly-vad-asr-sherpa-onnx-th-zipformer", + short_name="vad-asr-th-zipformer_gigaspeech2", + cmd=""" + pushd $model_name + mv encoder-epoch-12-avg-5.int8.onnx ../transducer-encoder.onnx + mv decoder-epoch-12-avg-5.onnx ../transducer-decoder.onnx + mv joiner-epoch-12-avg-5.int8.onnx ../transducer-joiner.onnx + mv tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 's/Zipformer/Zipformer supporting Thai 泰语/g' ../index.html + git diff + """, + ), + Model( + model_name="sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04", + hf="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech", + ms="k2-fsa/web-assembly-vad-asr-sherpa-onnx-zh-telespeech", + short_name="vad-asr-zh-telespeech", + cmd=""" + pushd $model_name + mv model.int8.onnx ../telespeech.onnx + mv tokens.txt ../ + popd + rm -rf $model_name + sed -i.bak 
's/Zipformer/TeleSpeech-ASR supporting Chinese 多种中文方言/g' ../index.html + git diff + """, + ), + ] + return models + + +def main(): + args = get_args() + index = args.index + total = args.total + assert 0 <= index < total, (index, total) + + all_model_list = get_models() + + num_models = len(all_model_list) + + num_per_runner = num_models // total + if num_per_runner <= 0: + raise ValueError(f"num_models: {num_models}, num_runners: {total}") + + start = index * num_per_runner + end = start + num_per_runner + + remaining = num_models - args.total * num_per_runner + + print(f"{index}/{total}: {start}-{end}/{num_models}") + + d = dict() + d["model_list"] = all_model_list[start:end] + if index < remaining: + s = args.total * num_per_runner + index + d["model_list"].append(all_model_list[s]) + print(f"{s}/{num_models}") + + filename_list = [ + "./run-vad-asr.sh", + ] + for filename in filename_list: + environment = jinja2.Environment() + with open(f"{filename}.in") as f: + s = f.read() + template = environment.from_string(s) + + s = template.render(**d) + with open(filename, "w") as f: + print(s, file=f) + + +if __name__ == "__main__": + main() diff --git a/scripts/wasm/run-vad-asr.sh.in b/scripts/wasm/run-vad-asr.sh.in new file mode 100644 index 000000000..8d5e1d206 --- /dev/null +++ b/scripts/wasm/run-vad-asr.sh.in @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +# +# Build WebAssembly APPs for huggingface spaces and modelscope spaces + +set -ex + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + + +{% for model in model_list %} +model_name={{ model.model_name }} +short_name={{ model.short_name }} +hf_name={{ model.hf }} +ms_name={{ model.ms }} + +pushd wasm/vad-asr +git checkout . 
+rm -rf assets +mkdir assets +cd assets +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${model_name}.tar.bz2 +tar xvf ${model_name}.tar.bz2 +rm ${model_name}.tar.bz2 + +{{ model.cmd }} + +popd + +ls -lh wasm/vad-asr/assets + +rm -rf build-wasm-simd-vad-asr/install +rm -rf build-wasm-simd-vad-asr/wasm + +./build-wasm-simd-vad-asr.sh + +dst=sherpa-onnx-wasm-simd-${SHERPA_ONNX_VERSION}-${short_name} +mv build-wasm-simd-vad-asr/install/bin/wasm/vad-asr $dst +ls -lh $dst +tar cjfv $dst.tar.bz2 ./$dst +ls -lh *.tar.bz2 + +git config --global user.email "csukuangfj@gmail.com" +git config --global user.name "Fangjun Kuang" + +export GIT_LFS_SKIP_SMUDGE=1 +export GIT_CLONE_PROTECTION_ACTIVE=false + +rm -rf ms +git clone https://www.modelscope.cn/studios/$ms_name.git ms + +cd ms +cp -v ../$dst/* . + +git status +git lfs track "*.data" +git lfs track "*.wasm" +ls -lh + +git add . +git commit -m "update model" +git push https://oauth2:${MS_TOKEN}@www.modelscope.cn/studios/$ms_name.git +cd .. +rm -rf ms + +rm -rf huggingface + +git clone https://huggingface.co/spaces/$hf_name huggingface +cd huggingface +cp -v ../$dst/* . + +git status +git lfs track "*.data" +git lfs track "*.wasm" +ls -lh + +git add . +git commit -m "update model" +git push https://csukuangfj:$HF_TOKEN@huggingface.co/spaces/$hf_name main +cd .. 
+rm -rf huggingface +rm -rf $dst + +ls -lh *.tar.bz2 + +{% endfor %} diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index e01ae0478..f2bbf9d76 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -13,6 +13,7 @@ #include "sherpa-onnx/csrc/audio-tagging.h" #include "sherpa-onnx/csrc/circular-buffer.h" #include "sherpa-onnx/csrc/display.h" +#include "sherpa-onnx/csrc/file-utils.h" #include "sherpa-onnx/csrc/keyword-spotter.h" #include "sherpa-onnx/csrc/macros.h" #include "sherpa-onnx/csrc/offline-punctuation.h" @@ -1638,3 +1639,7 @@ int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) { p->impl->Reset(); } + +int32_t SherpaOnnxFileExists(const char *filename) { + return sherpa_onnx::FileExists(filename); +} diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 97b8d8081..d4844aed1 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -1361,6 +1361,9 @@ SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate( SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( const SherpaOnnxLinearResampler *p); +// Return 1 if the file exists; return 0 if the file does not exist. 
+SHERPA_ONNX_API int32_t SherpaOnnxFileExists(const char *filename); + #if defined(__GNUC__) #pragma GCC diagnostic pop #endif diff --git a/wasm/CMakeLists.txt b/wasm/CMakeLists.txt index 075dfbf8d..b143e57b8 100644 --- a/wasm/CMakeLists.txt +++ b/wasm/CMakeLists.txt @@ -14,6 +14,10 @@ if(SHERPA_ONNX_ENABLE_WASM_VAD) add_subdirectory(vad) endif() +if(SHERPA_ONNX_ENABLE_WASM_VAD_ASR) + add_subdirectory(vad-asr) +endif() + if(SHERPA_ONNX_ENABLE_WASM_NODEJS) add_subdirectory(nodejs) endif() diff --git a/wasm/asr/assets/README.md b/wasm/asr/assets/README.md index d37c431a7..983347f78 100644 --- a/wasm/asr/assets/README.md +++ b/wasm/asr/assets/README.md @@ -80,3 +80,10 @@ assets fangjun$ tree -L 1 0 directories, 4 files ``` + +You can find example build scripts at: + + - Streaming Zipformer (English + Chinese): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/ wasm-simd-hf-space-zh-en-asr-zipformer.yaml + - Streaming Zipformer (English): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-en-asr-zipformer.yaml + - Streaming Paraformer (English + Chinese): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-zh-en-asr-paraformer.yaml + - Streaming Paraformer (English + Chinese + Cantonese): https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-zh-cantonese-en-asr-paraformer.yaml diff --git a/wasm/asr/index.html b/wasm/asr/index.html index 3156321c6..53ee43d8f 100644 --- a/wasm/asr/index.html +++ b/wasm/asr/index.html @@ -3,7 +3,7 @@ - Next-gen Kaldi WebAssembly with sherpa-onnx for Text-to-speech + Next-gen Kaldi WebAssembly with sherpa-onnx for ASR + + + +

+ Next-gen Kaldi + WebAssembly
+ VAD+ASR Demo with sherpa-onnx
+ (with Zipformer) +

+ +
+ Loading model ... ... +
+
+ + + +
+
+ +
+ +
+
+ + + + + + diff --git a/wasm/vad-asr/sherpa-onnx-asr.js b/wasm/vad-asr/sherpa-onnx-asr.js new file mode 120000 index 000000000..fada5db1d --- /dev/null +++ b/wasm/vad-asr/sherpa-onnx-asr.js @@ -0,0 +1 @@ +../asr/sherpa-onnx-asr.js \ No newline at end of file diff --git a/wasm/vad-asr/sherpa-onnx-vad.js b/wasm/vad-asr/sherpa-onnx-vad.js new file mode 120000 index 000000000..47b3c8d0b --- /dev/null +++ b/wasm/vad-asr/sherpa-onnx-vad.js @@ -0,0 +1 @@ +../vad/sherpa-onnx-vad.js \ No newline at end of file diff --git a/wasm/vad-asr/sherpa-onnx-wasm-main-vad-asr.cc b/wasm/vad-asr/sherpa-onnx-wasm-main-vad-asr.cc new file mode 100644 index 000000000..1e2fc00b2 --- /dev/null +++ b/wasm/vad-asr/sherpa-onnx-wasm-main-vad-asr.cc @@ -0,0 +1,19 @@ +// wasm/sherpa-onnx-wasm-main-vad-asr.cc +// +// Copyright (c) 2024 Xiaomi Corporation +#include + +#include +#include + +#include "sherpa-onnx/c-api/c-api.h" + +// see also +// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.html + +extern "C" { + +void CopyHeap(const char *src, int32_t num_bytes, char *dst) { + std::copy(src, src + num_bytes, dst); +} +} diff --git a/wasm/vad/assets/README.md b/wasm/vad/assets/README.md index 99510982a..3d5a76210 100644 --- a/wasm/vad/assets/README.md +++ b/wasm/vad/assets/README.md @@ -3,3 +3,6 @@ Please download https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx and put `silero_vad.onnx` into the current directory, i.e., `wasm/vad/assets`. + +You can find example build script at +https://github.com/k2-fsa/sherpa-onnx/blob/master/.github/workflows/wasm-simd-hf-space-silero-vad.yaml diff --git a/wasm/vad/index.html b/wasm/vad/index.html index 5d8e0372c..7ae2a76e6 100644 --- a/wasm/vad/index.html +++ b/wasm/vad/index.html @@ -3,7 +3,7 @@ - Next-gen Kaldi WebAssembly with sherpa-onnx for Text-to-speech + Next-gen Kaldi WebAssembly with sherpa-onnx for VAD + + + +

+ Next-gen Kaldi + WebAssembly
+ Speaker Diarization
with sherpa-onnx +

+
+ Loading model ... ... +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ +
+ + + + + diff --git a/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js b/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js new file mode 100644 index 000000000..ccfc8373c --- /dev/null +++ b/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js @@ -0,0 +1,295 @@ + +function freeConfig(config, Module) { + if ('buffer' in config) { + Module._free(config.buffer); + } + + if ('config' in config) { + freeConfig(config.config, Module) + } + + if ('segmentation' in config) { + freeConfig(config.segmentation, Module) + } + + if ('embedding' in config) { + freeConfig(config.embedding, Module) + } + + if ('clustering' in config) { + freeConfig(config.clustering, Module) + } + + Module._free(config.ptr); +} + +function initSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig( + config, Module) { + const modelLen = Module.lengthBytesUTF8(config.model || '') + 1; + const n = modelLen; + const buffer = Module._malloc(n); + + const len = 1 * 4; + const ptr = Module._malloc(len); + + let offset = 0; + Module.stringToUTF8(config.model || '', buffer + offset, modelLen); + offset += modelLen; + + offset = 0; + Module.setValue(ptr, buffer + offset, 'i8*'); + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + +function initSherpaOnnxOfflineSpeakerSegmentationModelConfig(config, Module) { + if (!('pyannote' in config)) { + config.pyannote = { + model: '', + }; + } + + const pyannote = initSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig( + config.pyannote, Module); + + const len = pyannote.len + 3 * 4; + const ptr = Module._malloc(len); + + let offset = 0; + Module._CopyHeap(pyannote.ptr, pyannote.len, ptr + offset); + offset += pyannote.len; + + Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); + offset += 4; + + Module.setValue(ptr + offset, config.debug || 1, 'i32'); + offset += 4; + + const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; + const buffer = Module._malloc(providerLen); + 
Module.stringToUTF8(config.provider || 'cpu', buffer, providerLen); + Module.setValue(ptr + offset, buffer, 'i8*'); + + return { + buffer: buffer, + ptr: ptr, + len: len, + config: pyannote, + }; +} + +function initSherpaOnnxSpeakerEmbeddingExtractorConfig(config, Module) { + const modelLen = Module.lengthBytesUTF8(config.model || '') + 1; + const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; + const n = modelLen + providerLen; + const buffer = Module._malloc(n); + + const len = 4 * 4; + const ptr = Module._malloc(len); + + let offset = 0; + Module.stringToUTF8(config.model || '', buffer + offset, modelLen); + offset += modelLen; + + Module.stringToUTF8(config.provider || 'cpu', buffer + offset, providerLen); + offset += providerLen; + + offset = 0 + Module.setValue(ptr + offset, buffer, 'i8*'); + offset += 4; + + Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); + offset += 4; + + Module.setValue(ptr + offset, config.debug || 1, 'i32'); + offset += 4; + + Module.setValue(ptr + offset, buffer + modelLen, 'i8*'); + offset += 4; + + return { + buffer: buffer, + ptr: ptr, + len: len, + }; +} + +function initSherpaOnnxFastClusteringConfig(config, Module) { + const len = 2 * 4; + const ptr = Module._malloc(len); + + let offset = 0; + Module.setValue(ptr + offset, config.numClusters || -1, 'i32'); + offset += 4; + + Module.setValue(ptr + offset, config.threshold || 0.5, 'float'); + offset += 4; + + return { + ptr: ptr, + len: len, + }; +} + +function initSherpaOnnxOfflineSpeakerDiarizationConfig(config, Module) { + if (!('segmentation' in config)) { + config.segmentation = { + pyannote: {model: ''}, + numThreads: 1, + debug: 0, + provider: 'cpu', + }; + } + + if (!('embedding' in config)) { + config.embedding = { + model: '', + numThreads: 1, + debug: 0, + provider: 'cpu', + }; + } + + if (!('clustering' in config)) { + config.clustering = { + numClusters: -1, + threshold: 0.5, + }; + } + + const segmentation = 
initSherpaOnnxOfflineSpeakerSegmentationModelConfig( + config.segmentation, Module); + + const embedding = + initSherpaOnnxSpeakerEmbeddingExtractorConfig(config.embedding, Module); + + const clustering = + initSherpaOnnxFastClusteringConfig(config.clustering, Module); + + const len = segmentation.len + embedding.len + clustering.len + 2 * 4; + const ptr = Module._malloc(len); + + let offset = 0; + Module._CopyHeap(segmentation.ptr, segmentation.len, ptr + offset); + offset += segmentation.len; + + Module._CopyHeap(embedding.ptr, embedding.len, ptr + offset); + offset += embedding.len; + + Module._CopyHeap(clustering.ptr, clustering.len, ptr + offset); + offset += clustering.len; + + Module.setValue(ptr + offset, config.minDurationOn || 0.2, 'float'); + offset += 4; + + Module.setValue(ptr + offset, config.minDurationOff || 0.5, 'float'); + offset += 4; + + return { + ptr: ptr, len: len, segmentation: segmentation, embedding: embedding, + clustering: clustering, + } +} + +class OfflineSpeakerDiarization { + constructor(configObj, Module) { + const config = + initSherpaOnnxOfflineSpeakerDiarizationConfig(configObj, Module) + // Module._MyPrint(config.ptr); + + const handle = + Module._SherpaOnnxCreateOfflineSpeakerDiarization(config.ptr); + + freeConfig(config, Module); + + this.handle = handle; + this.sampleRate = + Module._SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(this.handle); + this.Module = Module + + this.config = configObj; + } + + free() { + this.Module._SherpaOnnxDestroyOfflineSpeakerDiarization(this.handle); + this.handle = 0 + } + + setConfig(configObj) { + if (!('clustering' in configObj)) { + return; + } + + const config = + initSherpaOnnxOfflineSpeakerDiarizationConfig(configObj, this.Module); + + this.Module._SherpaOnnxOfflineSpeakerDiarizationSetConfig( + this.handle, config.ptr); + + freeConfig(config, Module); + + this.config.clustering = configObj.clustering; + } + + process(samples) { + const pointer = + this.Module._malloc(samples.length 
* samples.BYTES_PER_ELEMENT); + this.Module.HEAPF32.set(samples, pointer / samples.BYTES_PER_ELEMENT); + + let r = this.Module._SherpaOnnxOfflineSpeakerDiarizationProcess( + this.handle, pointer, samples.length); + this.Module._free(pointer); + + let numSegments = + this.Module._SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(r); + + let segments = + this.Module._SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime( + r); + + let ans = []; + + let sizeOfSegment = 3 * 4; + for (let i = 0; i < numSegments; ++i) { + let p = segments + i * sizeOfSegment + + let start = this.Module.HEAPF32[p / 4 + 0]; + let end = this.Module.HEAPF32[p / 4 + 1]; + let speaker = this.Module.HEAP32[p / 4 + 2]; + + ans.push({start: start, end: end, speaker: speaker}); + } + + this.Module._SherpaOnnxOfflineSpeakerDiarizationDestroySegment(segments); + this.Module._SherpaOnnxOfflineSpeakerDiarizationDestroyResult(r); + + return ans; + } +} + +function createOfflineSpeakerDiarization(Module, myConfig) { + const config = { + segmentation: { + pyannote: {model: './segmentation.onnx'}, + }, + embedding: {model: './embedding.onnx'}, + clustering: {numClusters: -1, threshold: 0.5}, + minDurationOn: 0.3, + minDurationOff: 0.5, + }; + + if (myConfig) { + config = myConfig; + } + + return new OfflineSpeakerDiarization(config, Module); +} + +if (typeof process == 'object' && typeof process.versions == 'object' && + typeof process.versions.node == 'string') { + module.exports = { + createOfflineSpeakerDiarization, + }; +} diff --git a/wasm/speaker-diarization/sherpa-onnx-wasm-main-speaker-diarization.cc b/wasm/speaker-diarization/sherpa-onnx-wasm-main-speaker-diarization.cc new file mode 100644 index 000000000..6e83f61d8 --- /dev/null +++ b/wasm/speaker-diarization/sherpa-onnx-wasm-main-speaker-diarization.cc @@ -0,0 +1,63 @@ +// wasm/sherpa-onnx-wasm-main-speaker-diarization.cc +// +// Copyright (c) 2024 Xiaomi Corporation +#include + +#include +#include + +#include 
"sherpa-onnx/c-api/c-api.h" + +// see also +// https://emscripten.org/docs/porting/connecting_cpp_and_javascript/Interacting-with-code.html + +extern "C" { + +static_assert(sizeof(SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig) == + 1 * 4, + ""); + +static_assert( + sizeof(SherpaOnnxOfflineSpeakerSegmentationModelConfig) == + sizeof(SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig) + 3 * 4, + ""); + +static_assert(sizeof(SherpaOnnxFastClusteringConfig) == 2 * 4, ""); + +static_assert(sizeof(SherpaOnnxSpeakerEmbeddingExtractorConfig) == 4 * 4, ""); + +static_assert(sizeof(SherpaOnnxOfflineSpeakerDiarizationConfig) == + sizeof(SherpaOnnxOfflineSpeakerSegmentationModelConfig) + + sizeof(SherpaOnnxSpeakerEmbeddingExtractorConfig) + + sizeof(SherpaOnnxFastClusteringConfig) + 2 * 4, + ""); + +void MyPrint(const SherpaOnnxOfflineSpeakerDiarizationConfig *sd_config) { + const auto &segmentation = sd_config->segmentation; + const auto &embedding = sd_config->embedding; + const auto &clustering = sd_config->clustering; + + fprintf(stdout, "----------segmentation config----------\n"); + fprintf(stdout, "pyannote model: %s\n", segmentation.pyannote.model); + fprintf(stdout, "num threads: %d\n", segmentation.num_threads); + fprintf(stdout, "debug: %d\n", segmentation.debug); + fprintf(stdout, "provider: %s\n", segmentation.provider); + + fprintf(stdout, "----------embedding config----------\n"); + fprintf(stdout, "model: %s\n", embedding.model); + fprintf(stdout, "num threads: %d\n", embedding.num_threads); + fprintf(stdout, "debug: %d\n", embedding.debug); + fprintf(stdout, "provider: %s\n", embedding.provider); + + fprintf(stdout, "----------clustering config----------\n"); + fprintf(stdout, "num_clusters: %d\n", clustering.num_clusters); + fprintf(stdout, "threshold: %.3f\n", clustering.threshold); + + fprintf(stdout, "min_duration_on: %.3f\n", sd_config->min_duration_on); + fprintf(stdout, "min_duration_off: %.3f\n", sd_config->min_duration_off); +} + +void 
CopyHeap(const char *src, int32_t num_bytes, char *dst) { + std::copy(src, src + num_bytes, dst); +} +} From f1b311ee4fe4d84468ed93d8479097b21d13c5d2 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 11 Oct 2024 10:27:16 +0800 Subject: [PATCH 227/237] Handle audio files less than 10s long for speaker diarization. (#1412) If the input audio file is less than 10 seconds long, there is only one chunk, and there is no need to compute embeddings or do clustering. We can use the segmentation result from the speaker segmentation model directly. --- ...ffline-speaker-diarization-pyannote-impl.h | 35 +++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h b/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h index 9667088d5..8f669e27c 100644 --- a/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h +++ b/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h @@ -99,6 +99,14 @@ class OfflineSpeakerDiarizationPyannoteImpl segmentations.clear(); + if (labels.size() == 1) { + if (callback) { + callback(1, 1, callback_arg); + } + + return HandleOneChunkSpecialCase(labels[0], n); + } + // labels[i] is a 0-1 matrix of shape (num_frames, num_speakers) // speaker count per frame @@ -201,7 +209,7 @@ class OfflineSpeakerDiarizationPyannoteImpl } int32_t num_chunks = (n - window_size) / window_shift + 1; - bool has_last_chunk = (n - window_size) % window_shift > 0; + bool has_last_chunk = ((n - window_size) % window_shift) > 0; ans.reserve(num_chunks + has_last_chunk); @@ -524,9 +532,9 @@ class OfflineSpeakerDiarizationPyannoteImpl count(seq, Eigen::all).array() += labels[i].array(); } - bool has_last_chunk = (num_samples - window_size) % window_shift > 0; + bool has_last_chunk = ((num_samples - window_size) % window_shift) > 0; - if (has_last_chunk) { + if (!has_last_chunk) { return count; } @@ -622,6 +630,27 @@ class OfflineSpeakerDiarizationPyannoteImpl return ans; } 
+ OfflineSpeakerDiarizationResult HandleOneChunkSpecialCase( + const Matrix2DInt32 &final_labels, int32_t num_samples) const { + const auto &meta_data = segmentation_model_.GetModelMetaData(); + int32_t window_size = meta_data.window_size; + int32_t window_shift = meta_data.window_shift; + int32_t receptive_field_shift = meta_data.receptive_field_shift; + + bool has_last_chunk = (num_samples - window_size) % window_shift > 0; + if (!has_last_chunk) { + return ComputeResult(final_labels); + } + + int32_t num_frames = final_labels.rows(); + + int32_t new_num_frames = num_samples / receptive_field_shift; + + num_frames = (new_num_frames <= num_frames) ? new_num_frames : num_frames; + + return ComputeResult(final_labels(Eigen::seq(0, num_frames), Eigen::all)); + } + void MergeSegments( std::vector *segments) const { float min_duration_off = config_.min_duration_off; From eefc17209589fe3b950f561bc9c8b8d1c9b8a742 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 11 Oct 2024 11:40:10 +0800 Subject: [PATCH 228/237] JavaScript API with WebAssembly for speaker diarization (#1414) #1408 uses [node-addon-api](https://github.com/nodejs/node-addon-api) to call C API from JavaScript, whereas this pull request uses WebAssembly to call C API from JavaScript. 
--- .github/scripts/test-nodejs-npm.sh | 12 ++++ .github/workflows/test-build-wheel.yaml | 2 +- .github/workflows/test-pip-install.yaml | 2 +- nodejs-examples/README.md | 16 +++++ .../test-offline-speaker-diarization.js | 64 +++++++++++++++++++ scripts/nodejs/index.js | 8 +++ wasm/nodejs/CMakeLists.txt | 12 ++++ wasm/speaker-diarization/assets/README.md | 4 -- .../sherpa-onnx-speaker-diarization.js | 12 ++-- 9 files changed, 122 insertions(+), 10 deletions(-) create mode 100644 nodejs-examples/test-offline-speaker-diarization.js diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index c41a0de65..03dec04aa 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -9,6 +9,18 @@ git status ls -lh ls -lh node_modules +echo '-----speaker diarization----------' +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + +node ./test-offline-speaker-diarization.js +rm -rfv *.wav *.onnx sherpa-onnx-pyannote-* + echo '-----vad+whisper----------' curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 diff --git a/.github/workflows/test-build-wheel.yaml b/.github/workflows/test-build-wheel.yaml index a9b2db589..8b7472b84 100644 --- a/.github/workflows/test-build-wheel.yaml +++ b/.github/workflows/test-build-wheel.yaml @@ -139,7 +139,7 @@ jobs: export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH export 
PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH - export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH + export PATH=/c/hostedtoolcache/windows/Python/3.12.7/x64/bin:$PATH which sherpa-onnx sherpa-onnx --help diff --git a/.github/workflows/test-pip-install.yaml b/.github/workflows/test-pip-install.yaml index 0f73e3643..b59b66b53 100644 --- a/.github/workflows/test-pip-install.yaml +++ b/.github/workflows/test-pip-install.yaml @@ -104,7 +104,7 @@ jobs: export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH export PATH=/c/hostedtoolcache/windows/Python/3.11.9/x64/bin:$PATH - export PATH=/c/hostedtoolcache/windows/Python/3.12.6/x64/bin:$PATH + export PATH=/c/hostedtoolcache/windows/Python/3.12.7/x64/bin:$PATH sherpa-onnx --help sherpa-onnx-keyword-spotter --help diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index 73a85de77..496a0062b 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -22,6 +22,22 @@ In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa for text-to-speech and speech-to-text. +# Speaker diarization + +In the following, we demonstrate how to run speaker diarization. + +```bash +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + +node ./test-offline-speaker-diarization.js +``` + # Text-to-speech In the following, we demonstrate how to run text-to-speech. 
diff --git a/nodejs-examples/test-offline-speaker-diarization.js b/nodejs-examples/test-offline-speaker-diarization.js new file mode 100644 index 000000000..de0f4a45b --- /dev/null +++ b/nodejs-examples/test-offline-speaker-diarization.js @@ -0,0 +1,64 @@ +// Copyright (c) 2024 Xiaomi Corporation +const sherpa_onnx = require('sherpa-onnx'); + +// clang-format off +/* Please use the following commands to download files + used in this script + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + + */ +// clang-format on + +const config = { + segmentation: { + pyannote: { + model: './sherpa-onnx-pyannote-segmentation-3-0/model.onnx', + debug: 1, + }, + }, + embedding: { + model: './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx', + debug: 1, + }, + clustering: { + // since we know that the test wave file + // ./0-four-speakers-zh.wav contains 4 speakers, we use 4 for numClusters + // here. if you don't have such information, please set numClusters to -1 + numClusters: 4, + + // If numClusters is not -1, then threshold is ignored. + // + // A larger threshold leads to fewer clusters, i.e., fewer speakers + // A smaller threshold leads to more clusters, i.e., more speakers + // You need to tune it by yourself. 
+ threshold: 0.5, + }, + + // If a segment is shorter than minDurationOn, we discard it + minDurationOn: 0.2, // in seconds + + // If the gap between two segments is less than minDurationOff, then we + // merge these two segments into a single one + minDurationOff: 0.5, // in seconds +}; + +const waveFilename = './0-four-speakers-zh.wav'; + +const sd = sherpa_onnx.createOfflineSpeakerDiarization(config); +console.log('Started') + +const wave = sherpa_onnx.readWave(waveFilename); +if (sd.sampleRate != wave.sampleRate) { + throw new Error( + `Expected sample rate: ${sd.sampleRate}, given: ${wave.sampleRate}`); +} + +const segments = sd.process(wave.samples); +console.log(segments); diff --git a/scripts/nodejs/index.js b/scripts/nodejs/index.js index 3f0789edb..b1b77841c 100644 --- a/scripts/nodejs/index.js +++ b/scripts/nodejs/index.js @@ -7,6 +7,8 @@ const sherpa_onnx_tts = require('./sherpa-onnx-tts.js'); const sherpa_onnx_kws = require('./sherpa-onnx-kws.js'); const sherpa_onnx_wave = require('./sherpa-onnx-wave.js'); const sherpa_onnx_vad = require('./sherpa-onnx-vad.js'); +const sherpa_onnx_speaker_diarization = + require('./sherpa-onnx-speaker-diarization.js'); function createOnlineRecognizer(config) { return sherpa_onnx_asr.createOnlineRecognizer(wasmModule, config); @@ -32,6 +34,11 @@ function createVad(config) { return sherpa_onnx_vad.createVad(wasmModule, config); } +function createOfflineSpeakerDiarization(config) { + return sherpa_onnx_speaker_diarization.createOfflineSpeakerDiarization( + wasmModule, config); +} + function readWave(filename) { return sherpa_onnx_wave.readWave(filename, wasmModule); } @@ -51,4 +58,5 @@ module.exports = { writeWave, createCircularBuffer, createVad, + createOfflineSpeakerDiarization, }; diff --git a/wasm/nodejs/CMakeLists.txt b/wasm/nodejs/CMakeLists.txt index 4efc879a1..dc8d8c854 100644 --- a/wasm/nodejs/CMakeLists.txt +++ b/wasm/nodejs/CMakeLists.txt @@ -70,6 +70,17 @@ set(exported_functions SherpaOnnxDestroySpeechSegment 
SherpaOnnxVoiceActivityDetectorReset SherpaOnnxVoiceActivityDetectorFlush + # Speaker diarization + SherpaOnnxCreateOfflineSpeakerDiarization + SherpaOnnxDestroyOfflineSpeakerDiarization + SherpaOnnxOfflineSpeakerDiarizationDestroyResult + SherpaOnnxOfflineSpeakerDiarizationDestroySegment + SherpaOnnxOfflineSpeakerDiarizationGetSampleRate + SherpaOnnxOfflineSpeakerDiarizationProcess + SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback + SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments + SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime + SherpaOnnxOfflineSpeakerDiarizationSetConfig # SherpaOnnxFileExists SherpaOnnxReadWave @@ -109,6 +120,7 @@ install( ${CMAKE_SOURCE_DIR}/wasm/tts/sherpa-onnx-tts.js ${CMAKE_SOURCE_DIR}/wasm/kws/sherpa-onnx-kws.js ${CMAKE_SOURCE_DIR}/wasm/vad/sherpa-onnx-vad.js + ${CMAKE_SOURCE_DIR}/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js ${CMAKE_SOURCE_DIR}/wasm/nodejs/sherpa-onnx-wave.js "$/sherpa-onnx-wasm-nodejs.js" "$/sherpa-onnx-wasm-nodejs.wasm" diff --git a/wasm/speaker-diarization/assets/README.md b/wasm/speaker-diarization/assets/README.md index 5c06139e2..f09a5899d 100644 --- a/wasm/speaker-diarization/assets/README.md +++ b/wasm/speaker-diarization/assets/README.md @@ -12,7 +12,6 @@ Remember to rename the downloaded files. The following is an example. 
- ```bash cd wasm/speaker-diarization/assets/ @@ -22,9 +21,6 @@ rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 cp sherpa-onnx-pyannote-segmentation-3-0/model.onnx ./segmentation.onnx rm -rf sherpa-onnx-pyannote-segmentation-3-0 - curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx mv 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ./embedding.onnx - - ``` diff --git a/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js b/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js index ccfc8373c..741013480 100644 --- a/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js +++ b/wasm/speaker-diarization/sherpa-onnx-speaker-diarization.js @@ -64,7 +64,7 @@ function initSherpaOnnxOfflineSpeakerSegmentationModelConfig(config, Module) { Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); offset += 4; - Module.setValue(ptr + offset, config.debug || 1, 'i32'); + Module.setValue(ptr + offset, config.debug || 0, 'i32'); offset += 4; const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; @@ -103,7 +103,7 @@ function initSherpaOnnxSpeakerEmbeddingExtractorConfig(config, Module) { Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); offset += 4; - Module.setValue(ptr + offset, config.debug || 1, 'i32'); + Module.setValue(ptr + offset, config.debug || 0, 'i32'); offset += 4; Module.setValue(ptr + offset, buffer + modelLen, 'i8*'); @@ -270,11 +270,15 @@ class OfflineSpeakerDiarization { } function createOfflineSpeakerDiarization(Module, myConfig) { - const config = { + let config = { segmentation: { pyannote: {model: './segmentation.onnx'}, + debug: 1, + }, + embedding: { + model: './embedding.onnx', + debug: 1, }, - embedding: {model: './embedding.onnx'}, clustering: {numClusters: -1, threshold: 0.5}, minDurationOn: 0.3, minDurationOff: 0.5, From 2d412b1190778bc35f337ef1feeb12292b5c9f92 Mon Sep 17 00:00:00 2001 
From: Fangjun Kuang Date: Fri, 11 Oct 2024 14:41:53 +0800 Subject: [PATCH 229/237] Kotlin API for speaker diarization (#1415) --- .../OfflineSpeakerDiarization.kt | 1 + kotlin-api-examples/run.sh | 31 +++ .../test_offline_speaker_diarization.kt | 53 +++++ .../csrc/offline-speaker-diarization-result.h | 2 +- sherpa-onnx/jni/CMakeLists.txt | 6 + .../jni/offline-speaker-diarization.cc | 219 ++++++++++++++++++ .../kotlin-api/OfflineSpeakerDiarization.kt | 101 ++++++++ 7 files changed, 412 insertions(+), 1 deletion(-) create mode 120000 kotlin-api-examples/OfflineSpeakerDiarization.kt create mode 100644 kotlin-api-examples/test_offline_speaker_diarization.kt create mode 100644 sherpa-onnx/jni/offline-speaker-diarization.cc create mode 100644 sherpa-onnx/kotlin-api/OfflineSpeakerDiarization.kt diff --git a/kotlin-api-examples/OfflineSpeakerDiarization.kt b/kotlin-api-examples/OfflineSpeakerDiarization.kt new file mode 120000 index 000000000..870612b4c --- /dev/null +++ b/kotlin-api-examples/OfflineSpeakerDiarization.kt @@ -0,0 +1 @@ +../sherpa-onnx/kotlin-api/OfflineSpeakerDiarization.kt \ No newline at end of file diff --git a/kotlin-api-examples/run.sh b/kotlin-api-examples/run.sh index 23e86886e..50e7816f1 100755 --- a/kotlin-api-examples/run.sh +++ b/kotlin-api-examples/run.sh @@ -285,6 +285,37 @@ function testPunctuation() { java -Djava.library.path=../build/lib -jar $out_filename } +function testOfflineSpeakerDiarization() { + if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + fi + + if [ ! 
-f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + fi + + if [ ! -f ./0-four-speakers-zh.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + fi + + out_filename=test_offline_speaker_diarization.jar + kotlinc-jvm -include-runtime -d $out_filename \ + test_offline_speaker_diarization.kt \ + OfflineSpeakerDiarization.kt \ + Speaker.kt \ + OnlineStream.kt \ + WaveReader.kt \ + faked-asset-manager.kt \ + faked-log.kt + + ls -lh $out_filename + + java -Djava.library.path=../build/lib -jar $out_filename +} + +testOfflineSpeakerDiarization testSpeakerEmbeddingExtractor testOnlineAsr testTts diff --git a/kotlin-api-examples/test_offline_speaker_diarization.kt b/kotlin-api-examples/test_offline_speaker_diarization.kt new file mode 100644 index 000000000..96c33f062 --- /dev/null +++ b/kotlin-api-examples/test_offline_speaker_diarization.kt @@ -0,0 +1,53 @@ +package com.k2fsa.sherpa.onnx + +fun main() { + testOfflineSpeakerDiarization() +} + +fun callback(numProcessedChunks: Int, numTotalChunks: Int, arg: Long): Int { + val progress = numProcessedChunks.toFloat() / numTotalChunks * 100 + val s = "%.2f".format(progress) + println("Progress: ${s}%"); + + return 0 +} + +fun testOfflineSpeakerDiarization() { + var config = OfflineSpeakerDiarizationConfig( + segmentation=OfflineSpeakerSegmentationModelConfig( + pyannote=OfflineSpeakerSegmentationPyannoteModelConfig("./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"), + ), + embedding=SpeakerEmbeddingExtractorConfig( + model="./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx", + ), + + // The test wave file ./0-four-speakers-zh.wav contains four speakers, so + // we use numClusters=4 here. 
If you don't know the number of speakers + // in the test wave file, please set the threshold like below. + // + // clustering=FastClusteringConfig(threshold=0.5), + // + // WARNING: You need to tune threshold by yourself. + // A larger threshold leads to fewer clusters, i.e., fewer speakers. + // A smaller threshold leads to more clusters, i.e., more speakers. + // + clustering=FastClusteringConfig(numClusters=4), + ) + + val sd = OfflineSpeakerDiarization(config=config) + + val waveData = WaveReader.readWave( + filename = "./0-four-speakers-zh.wav", + ) + + if (sd.sampleRate() != waveData.sampleRate) { + println("Expected sample rate: ${sd.sampleRate()}, given: ${waveData.sampleRate}") + return + } + + // val segments = sd.process(waveData.samples) // this one is also ok + val segments = sd.processWithCallback(waveData.samples, callback=::callback) + for (segment in segments) { + println("${segment.start} -- ${segment.end} speaker_${segment.speaker}") + } +} diff --git a/sherpa-onnx/csrc/offline-speaker-diarization-result.h b/sherpa-onnx/csrc/offline-speaker-diarization-result.h index 5fb144f5c..6298a87c7 100644 --- a/sherpa-onnx/csrc/offline-speaker-diarization-result.h +++ b/sherpa-onnx/csrc/offline-speaker-diarization-result.h @@ -58,7 +58,7 @@ class OfflineSpeakerDiarizationResult { std::vector> SortBySpeaker() const; - public: + private: std::vector segments_; }; diff --git a/sherpa-onnx/jni/CMakeLists.txt b/sherpa-onnx/jni/CMakeLists.txt index 998379084..23544c177 100644 --- a/sherpa-onnx/jni/CMakeLists.txt +++ b/sherpa-onnx/jni/CMakeLists.txt @@ -33,6 +33,12 @@ if(SHERPA_ONNX_ENABLE_TTS) ) endif() +if(SHERPA_ONNX_ENABLE_SPEAKER_DIARIZATION) + list(APPEND sources + offline-speaker-diarization.cc + ) +endif() + add_library(sherpa-onnx-jni ${sources}) target_compile_definitions(sherpa-onnx-jni PRIVATE SHERPA_ONNX_BUILD_SHARED_LIBS=1) diff --git a/sherpa-onnx/jni/offline-speaker-diarization.cc b/sherpa-onnx/jni/offline-speaker-diarization.cc new file mode 100644
index 000000000..a0eef8b9c --- /dev/null +++ b/sherpa-onnx/jni/offline-speaker-diarization.cc @@ -0,0 +1,219 @@ +// sherpa-onnx/jni/offline-speaker-diarization.cc +// +// Copyright (c) 2024 Xiaomi Corporation + +#include "sherpa-onnx/csrc/offline-speaker-diarization.h" + +#include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/jni/common.h" + +namespace sherpa_onnx { + +static OfflineSpeakerDiarizationConfig GetOfflineSpeakerDiarizationConfig( + JNIEnv *env, jobject config) { + OfflineSpeakerDiarizationConfig ans; + + jclass cls = env->GetObjectClass(config); + jfieldID fid; + + //---------- segmentation ---------- + fid = env->GetFieldID( + cls, "segmentation", + "Lcom/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationModelConfig;"); + jobject segmentation_config = env->GetObjectField(config, fid); + jclass segmentation_config_cls = env->GetObjectClass(segmentation_config); + + fid = env->GetFieldID( + segmentation_config_cls, "pyannote", + "Lcom/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationPyannoteModelConfig;"); + jobject pyannote_config = env->GetObjectField(segmentation_config, fid); + jclass pyannote_config_cls = env->GetObjectClass(pyannote_config); + + fid = env->GetFieldID(pyannote_config_cls, "model", "Ljava/lang/String;"); + jstring s = (jstring)env->GetObjectField(pyannote_config, fid); + const char *p = env->GetStringUTFChars(s, nullptr); + ans.segmentation.pyannote.model = p; + env->ReleaseStringUTFChars(s, p); + + fid = env->GetFieldID(segmentation_config_cls, "numThreads", "I"); + ans.segmentation.num_threads = env->GetIntField(segmentation_config, fid); + + fid = env->GetFieldID(segmentation_config_cls, "debug", "Z"); + ans.segmentation.debug = env->GetBooleanField(segmentation_config, fid); + + fid = env->GetFieldID(segmentation_config_cls, "provider", + "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(segmentation_config, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.segmentation.provider = p; + env->ReleaseStringUTFChars(s, p); + + 
//---------- embedding ---------- + fid = env->GetFieldID( + cls, "embedding", + "Lcom/k2fsa/sherpa/onnx/SpeakerEmbeddingExtractorConfig;"); + jobject embedding_config = env->GetObjectField(config, fid); + jclass embedding_config_cls = env->GetObjectClass(embedding_config); + + fid = env->GetFieldID(embedding_config_cls, "model", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(embedding_config, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.embedding.model = p; + env->ReleaseStringUTFChars(s, p); + + fid = env->GetFieldID(embedding_config_cls, "numThreads", "I"); + ans.embedding.num_threads = env->GetIntField(embedding_config, fid); + + fid = env->GetFieldID(embedding_config_cls, "debug", "Z"); + ans.embedding.debug = env->GetBooleanField(embedding_config, fid); + + fid = env->GetFieldID(embedding_config_cls, "provider", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(embedding_config, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.embedding.provider = p; + env->ReleaseStringUTFChars(s, p); + + //---------- clustering ---------- + fid = env->GetFieldID(cls, "clustering", + "Lcom/k2fsa/sherpa/onnx/FastClusteringConfig;"); + jobject clustering_config = env->GetObjectField(config, fid); + jclass clustering_config_cls = env->GetObjectClass(clustering_config); + + fid = env->GetFieldID(clustering_config_cls, "numClusters", "I"); + ans.clustering.num_clusters = env->GetIntField(clustering_config, fid); + + fid = env->GetFieldID(clustering_config_cls, "threshold", "F"); + ans.clustering.threshold = env->GetFloatField(clustering_config, fid); + + // its own fields + fid = env->GetFieldID(cls, "minDurationOn", "F"); + ans.min_duration_on = env->GetFloatField(config, fid); + + fid = env->GetFieldID(cls, "minDurationOff", "F"); + ans.min_duration_off = env->GetFloatField(config, fid); + + return ans; +} + +} // namespace sherpa_onnx + +SHERPA_ONNX_EXTERN_C +JNIEXPORT jlong JNICALL 
+Java_com_k2fsa_sherpa_onnx_OfflineSpeakerDiarization_newFromAsset( + JNIEnv *env, jobject /*obj*/, jobject asset_manager, jobject _config) { + return 0; +} + +SHERPA_ONNX_EXTERN_C +JNIEXPORT jlong JNICALL +Java_com_k2fsa_sherpa_onnx_OfflineSpeakerDiarization_newFromFile( + JNIEnv *env, jobject /*obj*/, jobject _config) { + auto config = sherpa_onnx::GetOfflineSpeakerDiarizationConfig(env, _config); + SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); + + if (!config.Validate()) { + SHERPA_ONNX_LOGE("Errors found in config!"); + return 0; + } + + auto sd = new sherpa_onnx::OfflineSpeakerDiarization(config); + + return (jlong)sd; +} + +SHERPA_ONNX_EXTERN_C +JNIEXPORT void JNICALL +Java_com_k2fsa_sherpa_onnx_OfflineSpeakerDiarization_setConfig( + JNIEnv *env, jobject /*obj*/, jlong ptr, jobject _config) { + auto config = sherpa_onnx::GetOfflineSpeakerDiarizationConfig(env, _config); + SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); + + auto sd = reinterpret_cast(ptr); + sd->SetConfig(config); +} + +SHERPA_ONNX_EXTERN_C +JNIEXPORT void JNICALL +Java_com_k2fsa_sherpa_onnx_OfflineSpeakerDiarization_delete(JNIEnv * /*env*/, + jobject /*obj*/, + jlong ptr) { + delete reinterpret_cast(ptr); +} + +static jobjectArray ProcessImpl( + JNIEnv *env, + const std::vector + &segments) { + jclass cls = + env->FindClass("com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationSegment"); + + jobjectArray obj_arr = + (jobjectArray)env->NewObjectArray(segments.size(), cls, nullptr); + + jmethodID constructor = env->GetMethodID(cls, "", "(FFI)V"); + + for (int32_t i = 0; i != segments.size(); ++i) { + const auto &s = segments[i]; + jobject segment = + env->NewObject(cls, constructor, s.Start(), s.End(), s.Speaker()); + env->SetObjectArrayElement(obj_arr, i, segment); + } + + return obj_arr; +} + +SHERPA_ONNX_EXTERN_C +JNIEXPORT jobjectArray JNICALL +Java_com_k2fsa_sherpa_onnx_OfflineSpeakerDiarization_process( + JNIEnv *env, jobject /*obj*/, jlong ptr, jfloatArray samples) { 
+ auto sd = reinterpret_cast(ptr); + + jfloat *p = env->GetFloatArrayElements(samples, nullptr); + jsize n = env->GetArrayLength(samples); + auto segments = sd->Process(p, n).SortByStartTime(); + env->ReleaseFloatArrayElements(samples, p, JNI_ABORT); + + return ProcessImpl(env, segments); +} + +SHERPA_ONNX_EXTERN_C +JNIEXPORT jobjectArray JNICALL +Java_com_k2fsa_sherpa_onnx_OfflineSpeakerDiarization_processWithCallback( + JNIEnv *env, jobject /*obj*/, jlong ptr, jfloatArray samples, + jobject callback, jlong arg) { + std::function callback_wrapper = + [env, callback](int32_t num_processed_chunks, int32_t num_total_chunks, + void *data) -> int { + jclass cls = env->GetObjectClass(callback); + + jmethodID mid = env->GetMethodID(cls, "invoke", "(IIJ)Ljava/lang/Integer;"); + if (mid == nullptr) { + SHERPA_ONNX_LOGE("Failed to get the callback. Ignore it."); + return 0; + } + + jobject ret = env->CallObjectMethod(callback, mid, num_processed_chunks, + num_total_chunks, (jlong)data); + jclass jklass = env->GetObjectClass(ret); + jmethodID int_value_mid = env->GetMethodID(jklass, "intValue", "()I"); + return env->CallIntMethod(ret, int_value_mid); + }; + + auto sd = reinterpret_cast(ptr); + + jfloat *p = env->GetFloatArrayElements(samples, nullptr); + jsize n = env->GetArrayLength(samples); + auto segments = + sd->Process(p, n, callback_wrapper, (void *)arg).SortByStartTime(); + env->ReleaseFloatArrayElements(samples, p, JNI_ABORT); + + return ProcessImpl(env, segments); +} + +SHERPA_ONNX_EXTERN_C +JNIEXPORT jint JNICALL +Java_com_k2fsa_sherpa_onnx_OfflineSpeakerDiarization_getSampleRate( + JNIEnv * /*env*/, jobject /*obj*/, jlong ptr) { + return reinterpret_cast(ptr) + ->SampleRate(); +} diff --git a/sherpa-onnx/kotlin-api/OfflineSpeakerDiarization.kt b/sherpa-onnx/kotlin-api/OfflineSpeakerDiarization.kt new file mode 100644 index 000000000..de0a9dffd --- /dev/null +++ b/sherpa-onnx/kotlin-api/OfflineSpeakerDiarization.kt @@ -0,0 +1,101 @@ +package com.k2fsa.sherpa.onnx 
+ +import android.content.res.AssetManager + +data class OfflineSpeakerSegmentationPyannoteModelConfig( + var model: String, +) + +data class OfflineSpeakerSegmentationModelConfig( + var pyannote: OfflineSpeakerSegmentationPyannoteModelConfig, + var numThreads: Int = 1, + var debug: Boolean = false, + var provider: String = "cpu", +) + +data class FastClusteringConfig( + var numClusters: Int = -1, + var threshold: Float = 0.5f, +) + +data class OfflineSpeakerDiarizationConfig( + var segmentation: OfflineSpeakerSegmentationModelConfig, + var embedding: SpeakerEmbeddingExtractorConfig, + var clustering: FastClusteringConfig, + var minDurationOn: Float = 0.2f, + var minDurationOff: Float = 0.5f, +) + +data class OfflineSpeakerDiarizationSegment( + val start: Float, // in seconds + val end: Float, // in seconds + val speaker: Int, // ID of the speaker; count from 0 +) + +class OfflineSpeakerDiarization( + assetManager: AssetManager? = null, + config: OfflineSpeakerDiarizationConfig, +) { + private var ptr: Long + + init { + ptr = if (assetManager != null) { + newFromAsset(assetManager, config) + } else { + newFromFile(config) + } + } + + protected fun finalize() { + if (ptr != 0L) { + delete(ptr) + ptr = 0 + } + } + + fun release() = finalize() + + // Only config.clustering is used. 
All other fields in config + // are ignored + fun setConfig(config: OfflineSpeakerDiarizationConfig) = setConfig(ptr, config) + + fun sampleRate() = getSampleRate(ptr) + + fun process(samples: FloatArray) = process(ptr, samples) + + fun processWithCallback( + samples: FloatArray, + callback: (numProcessedChunks: Int, numTotalChunks: Int, arg: Long) -> Int, + arg: Long = 0, + ) = processWithCallback(ptr, samples, callback, arg) + + private external fun delete(ptr: Long) + + private external fun newFromAsset( + assetManager: AssetManager, + config: OfflineSpeakerDiarizationConfig, + ): Long + + private external fun newFromFile( + config: OfflineSpeakerDiarizationConfig, + ): Long + + private external fun setConfig(ptr: Long, config: OfflineSpeakerDiarizationConfig) + + private external fun getSampleRate(ptr: Long): Int + + private external fun process(ptr: Long, samples: FloatArray): Array + + private external fun processWithCallback( + ptr: Long, + samples: FloatArray, + callback: (numProcessedChunks: Int, numTotalChunks: Int, arg: Long) -> Int, + arg: Long, + ): Array + + companion object { + init { + System.loadLibrary("sherpa-onnx-jni") + } + } +} From 1851ff63373ed1d3ef614b431a153bcc6528e4e2 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 11 Oct 2024 16:51:40 +0800 Subject: [PATCH 230/237] Java API for speaker diarization (#1416) --- .github/workflows/run-java-test.yaml | 7 ++ .../OfflineSpeakerDiarizationDemo.java | 99 +++++++++++++++++++ java-api-examples/README.md | 6 ++ .../run-offline-speaker-diarization.sh | 45 +++++++++ sherpa-onnx/java-api/Makefile | 9 ++ .../sherpa/onnx/FastClusteringConfig.java | 44 +++++++++ .../onnx/OfflineSpeakerDiarization.java | 61 ++++++++++++ .../OfflineSpeakerDiarizationCallback.java | 8 ++ .../onnx/OfflineSpeakerDiarizationConfig.java | 79 +++++++++++++++ .../OfflineSpeakerDiarizationSegment.java | 27 +++++ ...OfflineSpeakerSegmentationModelConfig.java | 52 ++++++++++ ...peakerSegmentationPyannoteModelConfig.java | 32 
++++++ .../k2fsa/sherpa/onnx/OfflineTtsCallback.java | 2 + .../onnx/SpeakerEmbeddingExtractorConfig.java | 1 - 14 files changed, 471 insertions(+), 1 deletion(-) create mode 100644 java-api-examples/OfflineSpeakerDiarizationDemo.java create mode 100755 java-api-examples/run-offline-speaker-diarization.sh create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/FastClusteringConfig.java create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarization.java create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationCallback.java create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationConfig.java create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationSegment.java create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationModelConfig.java create mode 100644 sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationPyannoteModelConfig.java diff --git a/.github/workflows/run-java-test.yaml b/.github/workflows/run-java-test.yaml index 3e932707c..5759ea5d8 100644 --- a/.github/workflows/run-java-test.yaml +++ b/.github/workflows/run-java-test.yaml @@ -107,6 +107,13 @@ jobs: make -j4 ls -lh lib + - name: Run java test (speaker diarization) + shell: bash + run: | + cd ./java-api-examples + ./run-offline-speaker-diarization.sh + rm -rfv *.onnx *.wav sherpa-onnx-pyannote-* + - name: Run java test (kws) + shell: bash + run: | diff --git a/java-api-examples/OfflineSpeakerDiarizationDemo.java b/java-api-examples/OfflineSpeakerDiarizationDemo.java new file mode 100644 index 000000000..a5ef8d1f4 --- /dev/null +++ b/java-api-examples/OfflineSpeakerDiarizationDemo.java @@ -0,0 +1,99 @@ +// Copyright 2024 Xiaomi Corporation + +// This file shows how to use sherpa-onnx Java API for speaker diarization. +import com.k2fsa.sherpa.onnx.*; + +public class OfflineSpeakerDiarizationDemo { + public static void
main(String[] args) { + /* Please use the following commands to download files used in this file + Step 1: Download a speaker segmentation model + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models + for a list of available models. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + + Step 2: Download a speaker embedding extractor model + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models + for a list of available models. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + + Step 3. Download test wave files + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models + for a list of available test wave files. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + + Step 4. 
Run it + */ + + String segmentationModel = "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"; + String embeddingModel = "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"; + String waveFilename = "./0-four-speakers-zh.wav"; + + WaveReader reader = new WaveReader(waveFilename); + + OfflineSpeakerSegmentationPyannoteModelConfig pyannote = + OfflineSpeakerSegmentationPyannoteModelConfig.builder().setModel(segmentationModel).build(); + + OfflineSpeakerSegmentationModelConfig segmentation = + OfflineSpeakerSegmentationModelConfig.builder() + .setPyannote(pyannote) + .setDebug(true) + .build(); + + SpeakerEmbeddingExtractorConfig embedding = + SpeakerEmbeddingExtractorConfig.builder().setModel(embeddingModel).setDebug(true).build(); + + // The test wave file ./0-four-speakers-zh.wav contains four speakers, so + // we use numClusters=4 here. If you don't know the number of speakers + // in the test wave file, please set the numClusters to -1 and provide + // threshold for clustering + FastClusteringConfig clustering = + FastClusteringConfig.builder() + .setNumClusters(4) // set it to -1 if you don't know the actual number + .setThreshold(0.5f) + .build(); + + OfflineSpeakerDiarizationConfig config = + OfflineSpeakerDiarizationConfig.builder() + .setSegmentation(segmentation) + .setEmbedding(embedding) + .setClustering(clustering) + .setMinDurationOn(0.2f) + .setMinDurationOff(0.5f) + .build(); + + OfflineSpeakerDiarization sd = new OfflineSpeakerDiarization(config); + if (sd.getSampleRate() != reader.getSampleRate()) { + System.out.printf( + "Expected sample rate: %d, given: %d\n", sd.getSampleRate(), reader.getSampleRate()); + return; + } + + // OfflineSpeakerDiarizationSegment[] segments = sd.process(reader.getSamples()); + // without callback is also ok + + // or you can use a callback to show the progress + OfflineSpeakerDiarizationSegment[] segments = + sd.processWithCallback( + reader.getSamples(), + (int numProcessedChunks, int numTotalChunks, long 
arg) -> { + float progress = 100.0f * numProcessedChunks / numTotalChunks; + System.out.printf("Progress: %.2f%%\n", progress); + + return 0; + }); + + for (OfflineSpeakerDiarizationSegment s : segments) { + System.out.printf("%.3f -- %.3f speaker_%02d\n", s.getStart(), s.getEnd(), s.getSpeaker()); + } + + sd.release(); + } +} diff --git a/java-api-examples/README.md b/java-api-examples/README.md index 697f0c876..779c1b254 100755 --- a/java-api-examples/README.md +++ b/java-api-examples/README.md @@ -4,6 +4,12 @@ This directory contains examples for the JAVA API of sherpa-onnx. # Usage +## Non-streaming speaker diarization + +```bash +./run-offline-speaker-diarization.sh +``` + ## Streaming Speech recognition ``` diff --git a/java-api-examples/run-offline-speaker-diarization.sh b/java-api-examples/run-offline-speaker-diarization.sh new file mode 100755 index 000000000..d5cd63b5f --- /dev/null +++ b/java-api-examples/run-offline-speaker-diarization.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +set -ex + +if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then + mkdir -p ../build + pushd ../build + cmake \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + -DSHERPA_ONNX_ENABLE_JNI=ON \ + .. + + make -j4 + ls -lh lib + popd +fi + +if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then + pushd ../sherpa-onnx/java-api + make + popd +fi + +if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +fi + +if [ ! 
-f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +fi + +if [ ! -f ./0-four-speakers-zh.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav +fi + +java \ + -Djava.library.path=$PWD/../build/lib \ + -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \ + ./OfflineSpeakerDiarizationDemo.java diff --git a/sherpa-onnx/java-api/Makefile b/sherpa-onnx/java-api/Makefile index 69c3631b4..6e4778ae7 100644 --- a/sherpa-onnx/java-api/Makefile +++ b/sherpa-onnx/java-api/Makefile @@ -68,6 +68,15 @@ java_files += KeywordSpotterConfig.java java_files += KeywordSpotterResult.java java_files += KeywordSpotter.java +java_files += OfflineSpeakerSegmentationPyannoteModelConfig.java +java_files += OfflineSpeakerSegmentationModelConfig.java +java_files += FastClusteringConfig.java +java_files += OfflineSpeakerDiarizationConfig.java +java_files += OfflineSpeakerDiarizationSegment.java +java_files += OfflineSpeakerDiarizationCallback.java +java_files += OfflineSpeakerDiarization.java + + class_files := $(java_files:%.java=%.class) java_files := $(addprefix src/$(package_dir)/,$(java_files)) diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/FastClusteringConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/FastClusteringConfig.java new file mode 100644 index 000000000..f2e957259 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/FastClusteringConfig.java @@ -0,0 +1,44 @@ +// Copyright 2024 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +public class FastClusteringConfig { + private final int numClusters; + private final float threshold; + + private FastClusteringConfig(Builder builder) { + this.numClusters = builder.numClusters; + this.threshold = builder.threshold; + } + + public static Builder 
builder() { + return new Builder(); + } + + public int getNumClusters() { + return numClusters; + } + + public float getThreshold() { + return threshold; + } + + public static class Builder { + private int numClusters = -1; + private float threshold = 0.5f; + + public FastClusteringConfig build() { + return new FastClusteringConfig(this); + } + + public Builder setNumClusters(int numClusters) { + this.numClusters = numClusters; + return this; + } + + public Builder setThreshold(float threshold) { + this.threshold = threshold; + return this; + } + } +} diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarization.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarization.java new file mode 100644 index 000000000..b75cd09ea --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarization.java @@ -0,0 +1,61 @@ +// Copyright 2024 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +public class OfflineSpeakerDiarization { + static { + System.loadLibrary("sherpa-onnx-jni"); + } + + private long ptr = 0; + + public OfflineSpeakerDiarization(OfflineSpeakerDiarizationConfig config) { + ptr = newFromFile(config); + } + + public int getSampleRate() { + return getSampleRate(ptr); + } + + // Only config.clustering is used. 
All other fields are ignored + public void setConfig(OfflineSpeakerDiarizationConfig config) { + setConfig(ptr, config); + } + + public OfflineSpeakerDiarizationSegment[] process(float[] samples) { + return process(ptr, samples); + } + + public OfflineSpeakerDiarizationSegment[] processWithCallback(float[] samples, OfflineSpeakerDiarizationCallback callback) { + return processWithCallback(ptr, samples, callback, 0); + } + + public OfflineSpeakerDiarizationSegment[] processWithCallback(float[] samples, OfflineSpeakerDiarizationCallback callback, long arg) { + return processWithCallback(ptr, samples, callback, arg); + } + + protected void finalize() throws Throwable { + release(); + } + + // You'd better call it manually if it is not used anymore + public void release() { + if (this.ptr == 0) { + return; + } + delete(this.ptr); + this.ptr = 0; + } + + private native int getSampleRate(long ptr); + + private native void delete(long ptr); + + private native long newFromFile(OfflineSpeakerDiarizationConfig config); + + private native void setConfig(long ptr, OfflineSpeakerDiarizationConfig config); + + private native OfflineSpeakerDiarizationSegment[] process(long ptr, float[] samples); + + private native OfflineSpeakerDiarizationSegment[] processWithCallback(long ptr, float[] samples, OfflineSpeakerDiarizationCallback callback, long arg); +} \ No newline at end of file diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationCallback.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationCallback.java new file mode 100644 index 000000000..7787386d3 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationCallback.java @@ -0,0 +1,8 @@ +// Copyright 2024 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +@FunctionalInterface +public interface OfflineSpeakerDiarizationCallback { + Integer invoke(int numProcessedChunks, int numTotalChunks, long arg); +} diff --git 
a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationConfig.java new file mode 100644 index 000000000..9965c5742 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationConfig.java @@ -0,0 +1,79 @@ +package com.k2fsa.sherpa.onnx; + +public class OfflineSpeakerDiarizationConfig { + private final OfflineSpeakerSegmentationModelConfig segmentation; + private final SpeakerEmbeddingExtractorConfig embedding; + private final FastClusteringConfig clustering; + private final float minDurationOn; + private final float minDurationOff; + + private OfflineSpeakerDiarizationConfig(Builder builder) { + this.segmentation = builder.segmentation; + this.embedding = builder.embedding; + this.clustering = builder.clustering; + this.minDurationOff = builder.minDurationOff; + this.minDurationOn = builder.minDurationOn; + } + + public static Builder builder() { + return new Builder(); + } + + public OfflineSpeakerSegmentationModelConfig getSegmentation() { + return segmentation; + } + + public SpeakerEmbeddingExtractorConfig getEmbedding() { + return embedding; + } + + public FastClusteringConfig getClustering() { + return clustering; + } + + public float getMinDurationOff() { + return minDurationOff; + } + + public float getMinDurationOn() { + return minDurationOn; + } + + public static class Builder { + private OfflineSpeakerSegmentationModelConfig segmentation = OfflineSpeakerSegmentationModelConfig.builder().build(); + private SpeakerEmbeddingExtractorConfig embedding = SpeakerEmbeddingExtractorConfig.builder().build(); + private FastClusteringConfig clustering = FastClusteringConfig.builder().build(); + private float minDurationOn = 0.2f; + private float minDurationOff = 0.5f; + + public OfflineSpeakerDiarizationConfig build() { + return new OfflineSpeakerDiarizationConfig(this); + } + + public Builder 
setSegmentation(OfflineSpeakerSegmentationModelConfig segmentation) { + this.segmentation = segmentation; + return this; + } + + public Builder setEmbedding(SpeakerEmbeddingExtractorConfig embedding) { + this.embedding = embedding; + return this; + } + + public Builder setClustering(FastClusteringConfig clustering) { + this.clustering = clustering; + return this; + } + + public Builder setMinDurationOff(float minDurationOff) { + this.minDurationOff = minDurationOff; + return this; + } + + public Builder setMinDurationOn(float minDurationOn) { + this.minDurationOn = minDurationOn; + return this; + } + } + +} diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationSegment.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationSegment.java new file mode 100644 index 000000000..1bb1a7635 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerDiarizationSegment.java @@ -0,0 +1,27 @@ +// Copyright 2024 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +public class OfflineSpeakerDiarizationSegment { + private final float start; + private final float end; + private final int speaker; + + public OfflineSpeakerDiarizationSegment(float start, float end, int speaker) { + this.start = start; + this.end = end; + this.speaker = speaker; + } + + public float getStart() { + return start; + } + + public float getEnd() { + return end; + } + + public int getSpeaker() { + return speaker; + } +} diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationModelConfig.java new file mode 100644 index 000000000..55df6c295 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationModelConfig.java @@ -0,0 +1,52 @@ +// Copyright 2024 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +public class OfflineSpeakerSegmentationModelConfig { + private final 
OfflineSpeakerSegmentationPyannoteModelConfig pyannote; + private final int numThreads; + private final boolean debug; + private final String provider; + + private OfflineSpeakerSegmentationModelConfig(Builder builder) { + this.pyannote = builder.pyannote; + this.numThreads = builder.numThreads; + this.debug = builder.debug; + this.provider = builder.provider; + } + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + private OfflineSpeakerSegmentationPyannoteModelConfig pyannote = OfflineSpeakerSegmentationPyannoteModelConfig.builder().build(); + private int numThreads = 1; + private boolean debug = true; + private String provider = "cpu"; + + public OfflineSpeakerSegmentationModelConfig build() { + return new OfflineSpeakerSegmentationModelConfig(this); + } + + public Builder setPyannote(OfflineSpeakerSegmentationPyannoteModelConfig pyannote) { + this.pyannote = pyannote; + return this; + } + + public Builder setNumThreads(int numThreads) { + this.numThreads = numThreads; + return this; + } + + public Builder setDebug(boolean debug) { + this.debug = debug; + return this; + } + + public Builder setProvider(String provider) { + this.provider = provider; + return this; + } + } +} \ No newline at end of file diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationPyannoteModelConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationPyannoteModelConfig.java new file mode 100644 index 000000000..51fd99874 --- /dev/null +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineSpeakerSegmentationPyannoteModelConfig.java @@ -0,0 +1,32 @@ +// Copyright 2024 Xiaomi Corporation + +package com.k2fsa.sherpa.onnx; + +public class OfflineSpeakerSegmentationPyannoteModelConfig { + private final String model; + + private OfflineSpeakerSegmentationPyannoteModelConfig(Builder builder) { + this.model = builder.model; + } + + public static Builder builder() { + return new 
Builder(); + } + + public String getModel() { + return model; + } + + public static class Builder { + private String model = ""; + + public OfflineSpeakerSegmentationPyannoteModelConfig build() { + return new OfflineSpeakerSegmentationPyannoteModelConfig(this); + } + + public Builder setModel(String model) { + this.model = model; + return this; + } + } +} diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java index 396594a96..2fc1d45dd 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsCallback.java @@ -1,3 +1,5 @@ +// Copyright 2024 Xiaomi Corporation + package com.k2fsa.sherpa.onnx; @FunctionalInterface diff --git a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SpeakerEmbeddingExtractorConfig.java b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SpeakerEmbeddingExtractorConfig.java index ffc688f34..80f800cdc 100644 --- a/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SpeakerEmbeddingExtractorConfig.java +++ b/sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SpeakerEmbeddingExtractorConfig.java @@ -50,5 +50,4 @@ public Builder setProvider(String provider) { return this; } } - } From 1ed803adc13a3b060a6b972253e3adfa81be8126 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 11 Oct 2024 21:17:41 +0800 Subject: [PATCH 231/237] Dart API for speaker diarization (#1418) --- .github/scripts/test-dart.sh | 5 + .github/workflows/test-dart.yaml | 1 + dart-api-examples/README.md | 1 + .../speaker-diarization/.gitignore | 3 + .../speaker-diarization/CHANGELOG.md | 3 + .../speaker-diarization/README.md | 7 + .../speaker-diarization/analysis_options.yaml | 30 ++ .../speaker-diarization/bin/init.dart | 1 + .../bin/speaker-diarization.dart | 100 +++++++ .../speaker-diarization/pubspec.yaml | 17 ++ dart-api-examples/speaker-diarization/run.sh | 21 ++ 
flutter/sherpa_onnx/example/example.md | 1 + flutter/sherpa_onnx/lib/sherpa_onnx.dart | 1 + .../lib/src/offline_speaker_diarization.dart | 243 ++++++++++++++++ .../lib/src/sherpa_onnx_bindings.dart | 263 +++++++++++++++++- flutter/sherpa_onnx/pubspec.yaml | 8 +- scripts/dart/speaker-diarization-pubspec.yaml | 16 ++ sherpa-onnx/c-api/c-api.cc | 16 ++ sherpa-onnx/c-api/c-api.h | 9 + ...ffline-speaker-diarization-pyannote-impl.h | 1 + .../jni/offline-speaker-diarization.cc | 3 +- 21 files changed, 733 insertions(+), 17 deletions(-) create mode 100644 dart-api-examples/speaker-diarization/.gitignore create mode 100644 dart-api-examples/speaker-diarization/CHANGELOG.md create mode 100644 dart-api-examples/speaker-diarization/README.md create mode 100644 dart-api-examples/speaker-diarization/analysis_options.yaml create mode 120000 dart-api-examples/speaker-diarization/bin/init.dart create mode 100644 dart-api-examples/speaker-diarization/bin/speaker-diarization.dart create mode 100644 dart-api-examples/speaker-diarization/pubspec.yaml create mode 100755 dart-api-examples/speaker-diarization/run.sh create mode 100644 flutter/sherpa_onnx/lib/src/offline_speaker_diarization.dart create mode 100644 scripts/dart/speaker-diarization-pubspec.yaml diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index 0aff2085e..27c21573a 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -4,6 +4,11 @@ set -ex cd dart-api-examples +pushd speaker-diarization +echo '----------speaker diarization----------' +./run.sh +popd + pushd speaker-identification echo '----------3d speaker----------' ./run-3d-speaker.sh diff --git a/.github/workflows/test-dart.yaml b/.github/workflows/test-dart.yaml index 58d505490..d9e27e86f 100644 --- a/.github/workflows/test-dart.yaml +++ b/.github/workflows/test-dart.yaml @@ -114,6 +114,7 @@ jobs: cp scripts/dart/audio-tagging-pubspec.yaml dart-api-examples/audio-tagging/pubspec.yaml cp 
scripts/dart/add-punctuations-pubspec.yaml dart-api-examples/add-punctuations/pubspec.yaml cp scripts/dart/speaker-id-pubspec.yaml dart-api-examples/speaker-identification/pubspec.yaml + cp scripts/dart/speaker-diarization-pubspec.yaml dart-api-examples/speaker-diarization/pubspec.yaml cp scripts/dart/sherpa-onnx-pubspec.yaml flutter/sherpa_onnx/pubspec.yaml diff --git a/dart-api-examples/README.md b/dart-api-examples/README.md index 9370372e7..3d66cb04e 100644 --- a/dart-api-examples/README.md +++ b/dart-api-examples/README.md @@ -9,6 +9,7 @@ https://pub.dev/packages/sherpa_onnx | Directory | Description | |-----------|-------------| +| [./speaker-diarization](./speaker-diarization)| Example for speaker diarization.| | [./add-punctuations](./add-punctuations)| Example for adding punctuations to text.| | [./audio-tagging](./audio-tagging)| Example for audio tagging.| | [./keyword-spotter](./keyword-spotter)| Example for keyword spotting| diff --git a/dart-api-examples/speaker-diarization/.gitignore b/dart-api-examples/speaker-diarization/.gitignore new file mode 100644 index 000000000..3a8579040 --- /dev/null +++ b/dart-api-examples/speaker-diarization/.gitignore @@ -0,0 +1,3 @@ +# https://dart.dev/guides/libraries/private-files +# Created by `dart pub` +.dart_tool/ diff --git a/dart-api-examples/speaker-diarization/CHANGELOG.md b/dart-api-examples/speaker-diarization/CHANGELOG.md new file mode 100644 index 000000000..effe43c82 --- /dev/null +++ b/dart-api-examples/speaker-diarization/CHANGELOG.md @@ -0,0 +1,3 @@ +## 1.0.0 + +- Initial version. diff --git a/dart-api-examples/speaker-diarization/README.md b/dart-api-examples/speaker-diarization/README.md new file mode 100644 index 000000000..d4d8c4fd2 --- /dev/null +++ b/dart-api-examples/speaker-diarization/README.md @@ -0,0 +1,7 @@ +# Introduction + +This example shows how to use the Dart API from sherpa-onnx for speaker diarization. 
+ +# Usage + +Please see [./run.sh](./run.sh) diff --git a/dart-api-examples/speaker-diarization/analysis_options.yaml b/dart-api-examples/speaker-diarization/analysis_options.yaml new file mode 100644 index 000000000..dee8927aa --- /dev/null +++ b/dart-api-examples/speaker-diarization/analysis_options.yaml @@ -0,0 +1,30 @@ +# This file configures the static analysis results for your project (errors, +# warnings, and lints). +# +# This enables the 'recommended' set of lints from `package:lints`. +# This set helps identify many issues that may lead to problems when running +# or consuming Dart code, and enforces writing Dart using a single, idiomatic +# style and format. +# +# If you want a smaller set of lints you can change this to specify +# 'package:lints/core.yaml'. These are just the most critical lints +# (the recommended set includes the core lints). +# The core lints are also what is used by pub.dev for scoring packages. + +include: package:lints/recommended.yaml + +# Uncomment the following section to specify additional rules. 
+ +# linter: +# rules: +# - camel_case_types + +# analyzer: +# exclude: +# - path/to/excluded/files/** + +# For more information about the core and recommended set of lints, see +# https://dart.dev/go/core-lints + +# For additional information about configuring this file, see +# https://dart.dev/guides/language/analysis-options diff --git a/dart-api-examples/speaker-diarization/bin/init.dart b/dart-api-examples/speaker-diarization/bin/init.dart new file mode 120000 index 000000000..48508cfd3 --- /dev/null +++ b/dart-api-examples/speaker-diarization/bin/init.dart @@ -0,0 +1 @@ +../../vad/bin/init.dart \ No newline at end of file diff --git a/dart-api-examples/speaker-diarization/bin/speaker-diarization.dart b/dart-api-examples/speaker-diarization/bin/speaker-diarization.dart new file mode 100644 index 000000000..760adc868 --- /dev/null +++ b/dart-api-examples/speaker-diarization/bin/speaker-diarization.dart @@ -0,0 +1,100 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; +import 'dart:ffi'; + +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + /* Please use the following commands to download files used in this file + Step 1: Download a speaker segmentation model + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models + for a list of available models. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + + Step 2: Download a speaker embedding extractor model + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models + for a list of available models. 
The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + + Step 3. Download test wave files + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models + for a list of available test wave files. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + + Step 4. Run it + */ + + final segmentationModel = + "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"; + + final embeddingModel = + "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"; + + final waveFilename = "./0-four-speakers-zh.wav"; + + final segmentationConfig = sherpa_onnx.OfflineSpeakerSegmentationModelConfig( + pyannote: sherpa_onnx.OfflineSpeakerSegmentationPyannoteModelConfig( + model: segmentationModel), + ); + + final embeddingConfig = + sherpa_onnx.SpeakerEmbeddingExtractorConfig(model: embeddingModel); + + // since we know there are 4 speakers in ./0-four-speakers-zh.wav, we set + // numClusters to 4. If you don't know the exact number, please set it to -1. + // in that case, you have to set threshold. A larger threshold leads to + // fewer clusters, i.e., fewer speakers. 
+ final clusteringConfig = + sherpa_onnx.FastClusteringConfig(numClusters: 4, threshold: 0.5); + + var config = sherpa_onnx.OfflineSpeakerDiarizationConfig( + segmentation: segmentationConfig, + embedding: embeddingConfig, + clustering: clusteringConfig, + minDurationOn: 0.2, + minDurationOff: 0.5); + + final sd = sherpa_onnx.OfflineSpeakerDiarization(config); + if (sd.ptr == nullptr) { + return; + } + + final waveData = sherpa_onnx.readWave(waveFilename); + if (sd.sampleRate != waveData.sampleRate) { + print( + 'Expected sample rate: ${sd.sampleRate}, given: ${waveData.sampleRate}'); + return; + } + + print('started'); + + // Use the following statement if you don't want to use a callback + // final segments = sd.process(samples: waveData.samples); + + final segments = sd.processWithCallback( + samples: waveData.samples, + callback: (int numProcessedChunk, int numTotalChunks) { + final progress = 100.0 * numProcessedChunk / numTotalChunks; + + print('Progress ${progress.toStringAsFixed(2)}%'); + + return 0; + }); + + for (int i = 0; i < segments.length; ++i) { + print( + '${segments[i].start.toStringAsFixed(3)} -- ${segments[i].end.toStringAsFixed(3)} speaker_${segments[i].speaker}'); + } +} diff --git a/dart-api-examples/speaker-diarization/pubspec.yaml b/dart-api-examples/speaker-diarization/pubspec.yaml new file mode 100644 index 000000000..28154a49c --- /dev/null +++ b/dart-api-examples/speaker-diarization/pubspec.yaml @@ -0,0 +1,17 @@ +name: speaker_diarization +description: > + This example demonstrates how to use the Dart API for speaker diarization. 
+ +version: 1.0.0 + +environment: + sdk: ">=3.0.0 <4.0.0" + +dependencies: + sherpa_onnx: ^1.10.27 + # sherpa_onnx: + # path: ../../flutter/sherpa_onnx + path: ^1.9.0 + +dev_dependencies: + lints: ^3.0.0 diff --git a/dart-api-examples/speaker-diarization/run.sh b/dart-api-examples/speaker-diarization/run.sh new file mode 100755 index 000000000..7717870dc --- /dev/null +++ b/dart-api-examples/speaker-diarization/run.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +fi + +if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +fi + +if [ ! 
-f ./0-four-speakers-zh.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav +fi + +dart run ./bin/speaker-diarization.dart diff --git a/flutter/sherpa_onnx/example/example.md b/flutter/sherpa_onnx/example/example.md index 7e7e8031d..0c24a79b2 100644 --- a/flutter/sherpa_onnx/example/example.md +++ b/flutter/sherpa_onnx/example/example.md @@ -11,6 +11,7 @@ | Functions | URL | Supported Platforms| |---|---|---| +|Speaker diarization| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/speaker-diarization)| macOS, Windows, Linux| |Streaming speech recognition| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/streaming-asr)| macOS, Windows, Linux| |Non-Streaming speech recognition| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/non-streaming-asr)| macOS, Windows, Linux| |Text to speech| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/tts)| macOS, Windows, Linux| diff --git a/flutter/sherpa_onnx/lib/sherpa_onnx.dart b/flutter/sherpa_onnx/lib/sherpa_onnx.dart index b15e67532..9fcd2872f 100644 --- a/flutter/sherpa_onnx/lib/sherpa_onnx.dart +++ b/flutter/sherpa_onnx/lib/sherpa_onnx.dart @@ -6,6 +6,7 @@ export 'src/audio_tagging.dart'; export 'src/feature_config.dart'; export 'src/keyword_spotter.dart'; export 'src/offline_recognizer.dart'; +export 'src/offline_speaker_diarization.dart'; export 'src/offline_stream.dart'; export 'src/online_recognizer.dart'; export 'src/online_stream.dart'; diff --git a/flutter/sherpa_onnx/lib/src/offline_speaker_diarization.dart b/flutter/sherpa_onnx/lib/src/offline_speaker_diarization.dart new file mode 100644 index 000000000..5981e3c04 --- /dev/null +++ b/flutter/sherpa_onnx/lib/src/offline_speaker_diarization.dart @@ -0,0 +1,243 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:ffi'; +import 'dart:typed_data'; + +import 
'package:ffi/ffi.dart'; + +import './sherpa_onnx_bindings.dart'; +import './speaker_identification.dart'; + +class OfflineSpeakerDiarizationSegment { + const OfflineSpeakerDiarizationSegment({ + required this.start, + required this.end, + required this.speaker, + }); + + @override + String toString() { + return 'OfflineSpeakerDiarizationSegment(start: $start, end: $end, speaker: $speaker)'; + } + + final double start; + final double end; + final int speaker; +} + +class OfflineSpeakerSegmentationPyannoteModelConfig { + const OfflineSpeakerSegmentationPyannoteModelConfig({ + this.model = '', + }); + + @override + String toString() { + return 'OfflineSpeakerSegmentationPyannoteModelConfig(model: $model)'; + } + + final String model; +} + +class OfflineSpeakerSegmentationModelConfig { + const OfflineSpeakerSegmentationModelConfig({ + this.pyannote = const OfflineSpeakerSegmentationPyannoteModelConfig(), + this.numThreads = 1, + this.debug = true, + this.provider = 'cpu', + }); + + @override + String toString() { + return 'OfflineSpeakerSegmentationModelConfig(pyannote: $pyannote, numThreads: $numThreads, debug: $debug, provider: $provider)'; + } + + final OfflineSpeakerSegmentationPyannoteModelConfig pyannote; + + final int numThreads; + final bool debug; + final String provider; +} + +class FastClusteringConfig { + const FastClusteringConfig({ + this.numClusters = -1, + this.threshold = 0.5, + }); + + @override + String toString() { + return 'FastClusteringConfig(numClusters: $numClusters, threshold: $threshold)'; + } + + final int numClusters; + final double threshold; +} + +class OfflineSpeakerDiarizationConfig { + const OfflineSpeakerDiarizationConfig({ + this.segmentation = const OfflineSpeakerSegmentationModelConfig(), + this.embedding = const SpeakerEmbeddingExtractorConfig(model: ''), + this.clustering = const FastClusteringConfig(), + this.minDurationOn = 0.2, + this.minDurationOff = 0.5, + }); + + @override + String toString() { + return 
'OfflineSpeakerDiarizationConfig(segmentation: $segmentation, embedding: $embedding, clustering: $clustering, minDurationOn: $minDurationOn, minDurationOff: $minDurationOff)'; + } + + final OfflineSpeakerSegmentationModelConfig segmentation; + final SpeakerEmbeddingExtractorConfig embedding; + final FastClusteringConfig clustering; + final double minDurationOff; // in seconds + final double minDurationOn; // in seconds +} + +class OfflineSpeakerDiarization { + OfflineSpeakerDiarization._( + {required this.ptr, required this.config, required this.sampleRate}); + + void free() { + SherpaOnnxBindings.sherpaOnnxDestroyOfflineSpeakerDiarization?.call(ptr); + ptr = nullptr; + } + + /// The user is responsible to call the OfflineSpeakerDiarization.free() + /// method of the returned instance to avoid memory leak. + factory OfflineSpeakerDiarization(OfflineSpeakerDiarizationConfig config) { + final c = calloc(); + + c.ref.segmentation.pyannote.model = + config.segmentation.pyannote.model.toNativeUtf8(); + c.ref.segmentation.numThreads = config.segmentation.numThreads; + c.ref.segmentation.debug = config.segmentation.debug ? 1 : 0; + c.ref.segmentation.provider = config.segmentation.provider.toNativeUtf8(); + + c.ref.embedding.model = config.embedding.model.toNativeUtf8(); + c.ref.embedding.numThreads = config.embedding.numThreads; + c.ref.embedding.debug = config.embedding.debug ? 1 : 0; + c.ref.embedding.provider = config.embedding.provider.toNativeUtf8(); + + c.ref.clustering.numClusters = config.clustering.numClusters; + c.ref.clustering.threshold = config.clustering.threshold; + + c.ref.minDurationOn = config.minDurationOn; + c.ref.minDurationOff = config.minDurationOff; + + final ptr = + SherpaOnnxBindings.sherpaOnnxCreateOfflineSpeakerDiarization?.call(c) ?? 
+ nullptr; + + calloc.free(c.ref.embedding.provider); + calloc.free(c.ref.embedding.model); + calloc.free(c.ref.segmentation.provider); + calloc.free(c.ref.segmentation.pyannote.model); + + int sampleRate = 0; + if (ptr != nullptr) { + sampleRate = SherpaOnnxBindings + .sherpaOnnxOfflineSpeakerDiarizationGetSampleRate + ?.call(ptr) ?? + 0; + } + return OfflineSpeakerDiarization._( + ptr: ptr, config: config, sampleRate: sampleRate); + } + + List process( + {required Float32List samples}) { + if (ptr == nullptr) { + return []; + } + + final n = samples.length; + final Pointer p = calloc(n); + + final pList = p.asTypedList(n); + pList.setAll(0, samples); + + final r = SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationProcess + ?.call(ptr, p, n) ?? + nullptr; + + final ans = _processImpl(r); + + SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroyResult + ?.call(r); + + return ans; + } + + List processWithCallback({ + required Float32List samples, + required int Function(int numProcessedChunks, int numTotalChunks) callback, + }) { + if (ptr == nullptr) { + return []; + } + + final n = samples.length; + final Pointer p = calloc(n); + + final pList = p.asTypedList(n); + pList.setAll(0, samples); + + final wrapper = NativeCallable< + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative>.isolateLocal( + (int numProcessedChunks, int numTotalChunks) { + return callback(numProcessedChunks, numTotalChunks); + }, exceptionalReturn: 0); + + final r = SherpaOnnxBindings + .sherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg + ?.call(ptr, p, n, wrapper.nativeFunction) ?? + nullptr; + + wrapper.close(); + + final ans = _processImpl(r); + + SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroyResult + ?.call(r); + + return ans; + } + + List _processImpl( + Pointer r) { + if (r == nullptr) { + return []; + } + + final numSegments = SherpaOnnxBindings + .sherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments + ?.call(r) ?? 
+ 0; + final segments = SherpaOnnxBindings + .sherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime + ?.call(r) ?? + nullptr; + + if (segments == nullptr) { + return []; + } + + final ans = []; + for (int i = 0; i != numSegments; ++i) { + final s = segments + i; + + final tmp = OfflineSpeakerDiarizationSegment( + start: s.ref.start, end: s.ref.end, speaker: s.ref.speaker); + ans.add(tmp); + } + + SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroySegment + ?.call(segments); + + return ans; + } + + Pointer ptr; + OfflineSpeakerDiarizationConfig config; + final int sampleRate; +} diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index 42294c2d4..8a8817d63 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -2,6 +2,66 @@ import 'dart:ffi'; import 'package:ffi/ffi.dart'; +final class SherpaOnnxSpeakerEmbeddingExtractorConfig extends Struct { + external Pointer model; + + @Int32() + external int numThreads; + + @Int32() + external int debug; + + external Pointer provider; +} + +final class SherpaOnnxOfflineSpeakerDiarizationSegment extends Struct { + @Float() + external double start; + + @Float() + external double end; + + @Int32() + external int speaker; +} + +final class SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig + extends Struct { + external Pointer model; +} + +final class SherpaOnnxOfflineSpeakerSegmentationModelConfig extends Struct { + external SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig pyannote; + + @Int32() + external int numThreads; + + @Int32() + external int debug; + + external Pointer provider; +} + +final class SherpaOnnxFastClusteringConfig extends Struct { + @Int32() + external int numClusters; + + @Float() + external double threshold; +} + +final class SherpaOnnxOfflineSpeakerDiarizationConfig extends Struct { + external 
SherpaOnnxOfflineSpeakerSegmentationModelConfig segmentation; + external SherpaOnnxSpeakerEmbeddingExtractorConfig embedding; + external SherpaOnnxFastClusteringConfig clustering; + + @Float() + external double minDurationOn; + + @Float() + external double minDurationOff; +} + final class SherpaOnnxOfflinePunctuationModelConfig extends Struct { external Pointer ctTransformer; @@ -341,18 +401,6 @@ final class SherpaOnnxWave extends Struct { external int numSamples; } -final class SherpaOnnxSpeakerEmbeddingExtractorConfig extends Struct { - external Pointer model; - - @Int32() - external int numThreads; - - @Int32() - external int debug; - - external Pointer provider; -} - final class SherpaOnnxKeywordSpotterConfig extends Struct { external SherpaOnnxFeatureConfig feat; @@ -402,10 +450,101 @@ final class SherpaOnnxSpeakerEmbeddingExtractor extends Opaque {} final class SherpaOnnxSpeakerEmbeddingManager extends Opaque {} +final class SherpaOnnxOfflineSpeakerDiarization extends Opaque {} + +final class SherpaOnnxOfflineSpeakerDiarizationResult extends Opaque {} + +typedef SherpaOnnxCreateOfflineSpeakerDiarizationNative + = Pointer Function( + Pointer); + +typedef SherpaOnnxCreateOfflineSpeakerDiarization + = SherpaOnnxCreateOfflineSpeakerDiarizationNative; + +typedef SherpaOnnxDestroyOfflineSpeakerDiarizationNative = Void Function( + Pointer); + +typedef SherpaOnnxDestroyOfflineSpeakerDiarization = void Function( + Pointer); + typedef SherpaOnnxCreateOfflinePunctuationNative = Pointer Function( Pointer); +typedef SherpaOnnxOfflineSpeakerDiarizationGetSampleRateNative = Int32 Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationGetSampleRate = int Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationSetConfigNative = Void Function( + Pointer, + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakersNative = Int32 + Function(Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers = int Function( + 
Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegmentsNative = Int32 + Function(Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments = int Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTimeNative + = Pointer Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime + = SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTimeNative; + +typedef SherpaOnnxOfflineSpeakerDiarizationDestroySegmentNative = Void Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationDestroySegment = void Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationProcessNative + = Pointer Function( + Pointer, Pointer, Int32); + +typedef SherpaOnnxOfflineSpeakerDiarizationProcess + = Pointer Function( + Pointer, Pointer, int); + +typedef SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative = Int32 + Function(Int32, Int32); + +typedef SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArgNative + = Pointer Function( + Pointer, + Pointer, + Int32, + Pointer< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative>>); + +typedef SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg + = Pointer Function( + Pointer, + Pointer, + int, + Pointer< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative>>); + +typedef SherpaOnnxOfflineSpeakerDiarizationDestroyResultNative = Void Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationDestroyResult = void Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationSetConfig = void Function( + Pointer, + Pointer); + typedef SherpaOnnxCreateOfflinePunctuation = SherpaOnnxCreateOfflinePunctuationNative; @@ -940,6 +1079,29 @@ typedef SherpaOnnxFreeWaveNative = Void Function(Pointer); typedef SherpaOnnxFreeWave = void Function(Pointer); class SherpaOnnxBindings { + static SherpaOnnxCreateOfflineSpeakerDiarization? 
+ sherpaOnnxCreateOfflineSpeakerDiarization; + static SherpaOnnxDestroyOfflineSpeakerDiarization? + sherpaOnnxDestroyOfflineSpeakerDiarization; + static SherpaOnnxOfflineSpeakerDiarizationGetSampleRate? + sherpaOnnxOfflineSpeakerDiarizationGetSampleRate; + static SherpaOnnxOfflineSpeakerDiarizationSetConfig? + sherpaOnnxOfflineSpeakerDiarizationSetConfig; + static SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers? + sherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers; + static SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments? + sherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments; + static SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime? + sherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime; + static SherpaOnnxOfflineSpeakerDiarizationDestroySegment? + sherpaOnnxOfflineSpeakerDiarizationDestroySegment; + static SherpaOnnxOfflineSpeakerDiarizationProcess? + sherpaOnnxOfflineSpeakerDiarizationProcess; + static SherpaOnnxOfflineSpeakerDiarizationDestroyResult? + sherpaOnnxOfflineSpeakerDiarizationDestroyResult; + static SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg? + sherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg; + static SherpaOnnxCreateOfflinePunctuation? sherpaOnnxCreateOfflinePunctuation; static SherpaOnnxDestroyOfflinePunctuation? sherpaOnnxDestroyOfflinePunctuation; @@ -1107,6 +1269,83 @@ class SherpaOnnxBindings { static SherpaOnnxFreeWave? 
freeWave; static void init(DynamicLibrary dynamicLibrary) { + sherpaOnnxCreateOfflineSpeakerDiarization ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxCreateOfflineSpeakerDiarizationNative>>( + 'SherpaOnnxCreateOfflineSpeakerDiarization') + .asFunction(); + + sherpaOnnxDestroyOfflineSpeakerDiarization ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxDestroyOfflineSpeakerDiarizationNative>>( + 'SherpaOnnxDestroyOfflineSpeakerDiarization') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationGetSampleRate ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationGetSampleRateNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationGetSampleRate') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationSetConfig ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationSetConfigNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationSetConfig') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakersNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegmentsNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTimeNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationDestroySegment ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationDestroySegmentNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationDestroySegment') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationProcess ??= dynamicLibrary + 
.lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationProcessNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationProcess') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArgNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationDestroyResult ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationDestroyResultNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationDestroyResult') + .asFunction(); + sherpaOnnxCreateOfflinePunctuation ??= dynamicLibrary .lookup>( 'SherpaOnnxCreateOfflinePunctuation') diff --git a/flutter/sherpa_onnx/pubspec.yaml b/flutter/sherpa_onnx/pubspec.yaml index 5b693ef0b..e92071833 100644 --- a/flutter/sherpa_onnx/pubspec.yaml +++ b/flutter/sherpa_onnx/pubspec.yaml @@ -1,8 +1,8 @@ name: sherpa_onnx description: > - Speech recognition, speech synthesis, and speaker recognition using next-gen Kaldi - with onnxruntime without Internet connection. + Speech recognition, speech synthesis, speaker diarization, and speaker recognition + using next-gen Kaldi with onnxruntime without Internet connection. 
repository: https://github.com/k2-fsa/sherpa-onnx/tree/master/flutter @@ -12,7 +12,7 @@ documentation: https://k2-fsa.github.io/sherpa/onnx/ topics: - speech-recognition - speech-synthesis - - speaker-identification + - speaker-diarization - audio-tagging - voice-activity-detection @@ -41,7 +41,7 @@ dependencies: sherpa_onnx_linux: ^1.10.27 # sherpa_onnx_linux: # path: ../sherpa_onnx_linux - # + sherpa_onnx_windows: ^1.10.27 # sherpa_onnx_windows: # path: ../sherpa_onnx_windows diff --git a/scripts/dart/speaker-diarization-pubspec.yaml b/scripts/dart/speaker-diarization-pubspec.yaml new file mode 100644 index 000000000..fec147e75 --- /dev/null +++ b/scripts/dart/speaker-diarization-pubspec.yaml @@ -0,0 +1,16 @@ +name: speaker_diarization +description: > + This example demonstrates how to use the Dart API for speaker diarization. + +version: 1.0.0 + +environment: + sdk: ">=3.0.0 <4.0.0" + +dependencies: + sherpa_onnx: + path: ../../flutter/sherpa_onnx + # path: ^1.9.0 + +dev_dependencies: + lints: ^3.0.0 diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index abcfc5b82..4ba0a4a60 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -1828,4 +1828,20 @@ SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback( return ans; } +const SherpaOnnxOfflineSpeakerDiarizationResult * +SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg( + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples, + int32_t n, + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg callback) { + auto wrapper = [callback](int32_t num_processed_chunks, + int32_t num_total_chunks, void *) { + return callback(num_processed_chunks, num_total_chunks); + }; + + auto ans = new SherpaOnnxOfflineSpeakerDiarizationResult; + ans->impl = sd->impl->Process(samples, n, wrapper); + + return ans; +} + #endif diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index c9e7f9ee1..4b41a81a9 100644 --- a/sherpa-onnx/c-api/c-api.h +++ 
b/sherpa-onnx/c-api/c-api.h @@ -1485,6 +1485,9 @@ SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroySegment( typedef int32_t (*SherpaOnnxOfflineSpeakerDiarizationProgressCallback)( int32_t num_processed_chunk, int32_t num_total_chunks, void *arg); +typedef int32_t (*SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg)( + int32_t num_processed_chunk, int32_t num_total_chunks); + // The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroyResult() // to free the returned pointer to avoid memory leak. SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult * @@ -1500,6 +1503,12 @@ SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback( int32_t n, SherpaOnnxOfflineSpeakerDiarizationProgressCallback callback, void *arg); +SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult * +SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg( + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples, + int32_t n, + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg callback); + SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroyResult( const SherpaOnnxOfflineSpeakerDiarizationResult *r); diff --git a/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h b/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h index 8f669e27c..0c70f0bc6 100644 --- a/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h +++ b/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h @@ -5,6 +5,7 @@ #define SHERPA_ONNX_CSRC_OFFLINE_SPEAKER_DIARIZATION_PYANNOTE_IMPL_H_ #include +#include #include #include #include diff --git a/sherpa-onnx/jni/offline-speaker-diarization.cc b/sherpa-onnx/jni/offline-speaker-diarization.cc index a0eef8b9c..e82962c80 100644 --- a/sherpa-onnx/jni/offline-speaker-diarization.cc +++ b/sherpa-onnx/jni/offline-speaker-diarization.cc @@ -204,7 +204,8 @@ Java_com_k2fsa_sherpa_onnx_OfflineSpeakerDiarization_processWithCallback( jfloat *p = env->GetFloatArrayElements(samples, 
nullptr); jsize n = env->GetArrayLength(samples); auto segments = - sd->Process(p, n, callback_wrapper, (void *)arg).SortByStartTime(); + sd->Process(p, n, callback_wrapper, reinterpret_cast(arg)) + .SortByStartTime(); env->ReleaseFloatArrayElements(samples, p, JNI_ABORT); return ProcessImpl(env, segments); From 5e273c5be44e349b8e65cb649bc6e7e05f4f5ba7 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 12 Oct 2024 12:28:38 +0800 Subject: [PATCH 232/237] Pascal API for speaker diarization (#1420) --- .github/workflows/pascal.yaml | 15 + pascal-api-examples/README.md | 1 + .../speaker-diarization/main.pas | 104 ++++++ .../speaker-diarization/run.sh | 49 +++ sherpa-onnx/pascal-api/sherpa_onnx.pas | 339 +++++++++++++++++- 5 files changed, 506 insertions(+), 2 deletions(-) create mode 100644 pascal-api-examples/speaker-diarization/main.pas create mode 100755 pascal-api-examples/speaker-diarization/run.sh diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml index 2ed213184..ba9a73163 100644 --- a/.github/workflows/pascal.yaml +++ b/.github/workflows/pascal.yaml @@ -127,6 +127,21 @@ jobs: cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts fi + - name: Run Pascal test (Speaker diarization) + shell: bash + run: | + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH + + cd ./pascal-api-examples + pushd speaker-diarization + + ./run.sh + rm -rfv *.onnx *.wav sherpa-onnx-* + ls -lh + echo "---" + + popd + - name: Run Pascal test (TTS) shell: bash run: | diff --git a/pascal-api-examples/README.md b/pascal-api-examples/README.md index 5475d825b..5e709cd7e 100644 --- a/pascal-api-examples/README.md +++ b/pascal-api-examples/README.md @@ -9,6 +9,7 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html |Directory| Description| |---------|------------| |[read-wav](./read-wav)|It shows how to read a wave file.| +|[speaker-diarization](./speaker-diarization)|It shows how to use Pascal API for speaker diarization.| 
|[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.| |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.| |[vad](./vad)| It shows how to use the voice activity detection API.| diff --git a/pascal-api-examples/speaker-diarization/main.pas b/pascal-api-examples/speaker-diarization/main.pas new file mode 100644 index 000000000..35d915d0b --- /dev/null +++ b/pascal-api-examples/speaker-diarization/main.pas @@ -0,0 +1,104 @@ +{ Copyright (c) 2024 Xiaomi Corporation } +{ +This file shows how to use the Pascal API from sherpa-onnx +for speaker diarization. + +Usage: + +Step 1: Download a speaker segmentation model + +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models +for a list of available models. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + +Step 2: Download a speaker embedding extractor model + +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models +for a list of available models. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + +Step 3. Download test wave files + +Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models +for a list of available test wave files. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + +Step 4. 
Run it +} + +program main; + +{$mode delphi} + +uses + sherpa_onnx, + ctypes, + SysUtils; + +function ProgressCallback( + NumProcessedChunks: cint32; + NumTotalChunks: cint32): cint32; cdecl; +var + Progress: Single; +begin + Progress := 100.0 * NumProcessedChunks / NumTotalChunks; + WriteLn(Format('Progress: %.3f%%', [Progress])); + + Result := 0; +end; + +var + Wave: TSherpaOnnxWave; + Config: TSherpaOnnxOfflineSpeakerDiarizationConfig; + Sd: TSherpaOnnxOfflineSpeakerDiarization; + Segments: TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; + I: Integer; +begin + Wave := SherpaOnnxReadWave('./0-four-speakers-zh.wav'); + + Config.Segmentation.Pyannote.Model := './sherpa-onnx-pyannote-segmentation-3-0/model.onnx'; + Config.Embedding.Model := './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx'; + + { + Since we know that there are 4 speakers in ./0-four-speakers-zh.wav, we + set NumClusters to 4 here. + If you don't have such information, please set NumClusters to -1. + In that case, you have to set Config.Clustering.Threshold. + A larger threshold leads to fewer clusters, i.e., fewer speakers. + } + Config.Clustering.NumClusters := 4; + Config.Segmentation.Debug := True; + Config.Embedding.Debug := True; + + Sd := TSherpaOnnxOfflineSpeakerDiarization.Create(Config); + if Sd.GetHandle = nil then + begin + WriteLn('Please check your config'); + Exit; + end; + + if Sd.GetSampleRate <> Wave.SampleRate then + begin + WriteLn(Format('Expected sample rate: %d, given: %d', [Sd.GetSampleRate, Wave.SampleRate])); + Exit; + end; + + { + // If you don't want to use a callback + Segments := Sd.Process(Wave.Samples); + } + Segments := Sd.Process(Wave.Samples, @ProgressCallback); + + for I := Low(Segments) to High(Segments) do + begin + WriteLn(Format('%.3f -- %.3f speaker_%d', + [Segments[I].Start, Segments[I].Stop, Segments[I].Speaker])); + end; + + FreeAndNil(Sd); +end. 
diff --git a/pascal-api-examples/speaker-diarization/run.sh b/pascal-api-examples/speaker-diarization/run.sh new file mode 100755 index 000000000..866dc63c9 --- /dev/null +++ b/pascal-api-examples/speaker-diarization/run.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./main.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +fi + +if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +fi + +if [ ! 
-f ./0-four-speakers-zh.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav +fi + +./main diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index 7f05793e1..1b24dec80 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -102,7 +102,7 @@ TSherpaOnnxOfflineTts = class function Generate(Text: AnsiString; SpeakerId: Integer; Speed: Single; - Callback:PSherpaOnnxGeneratedAudioCallbackWithArg; + Callback: PSherpaOnnxGeneratedAudioCallbackWithArg; Arg: Pointer ): TSherpaOnnxGeneratedAudio; overload; @@ -398,6 +398,78 @@ TSherpaOnnxVoiceActivityDetector = class property GetHandle: Pointer Read Handle; end; + + TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig = record + Model: AnsiString; + function ToString: AnsiString; + end; + + TSherpaOnnxOfflineSpeakerSegmentationModelConfig = record + Pyannote: TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig; + NumThreads: Integer; + Debug: Boolean; + Provider: AnsiString; + function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerSegmentationModelConfig); + end; + + TSherpaOnnxFastClusteringConfig = record + NumClusters: Integer; + Threshold: Single; + function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFastClusteringConfig); + end; + + TSherpaOnnxSpeakerEmbeddingExtractorConfig = record + Model: AnsiString; + NumThreads: Integer; + Debug: Boolean; + Provider: AnsiString; + function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSpeakerEmbeddingExtractorConfig); + end; + + TSherpaOnnxOfflineSpeakerDiarizationConfig = record + Segmentation: TSherpaOnnxOfflineSpeakerSegmentationModelConfig; + Embedding: TSherpaOnnxSpeakerEmbeddingExtractorConfig; + Clustering: 
TSherpaOnnxFastClusteringConfig; + MinDurationOn: Single; + MinDurationOff: Single; + function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerDiarizationConfig); + end; + + TSherpaOnnxOfflineSpeakerDiarizationSegment = record + Start: Single; + Stop: Single; + Speaker: Integer; + function ToString: AnsiString; + end; + + TSherpaOnnxOfflineSpeakerDiarizationSegmentArray = array of TSherpaOnnxOfflineSpeakerDiarizationSegment; + + PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg = ^TSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg; + + TSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg = function( + NumProcessChunks: cint32; + NumTotalChunks: cint32): cint32; cdecl; + + TSherpaOnnxOfflineSpeakerDiarization = class + private + Handle: Pointer; + SampleRate: Integer; + _Config: TSherpaOnnxOfflineSpeakerDiarizationConfig; + public + constructor Create(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig); + destructor Destroy; override; + procedure SetConfig(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig); + function Process(Samples: array of Single): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; overload; + function Process(Samples: array of Single; Callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; overload; + property GetHandle: Pointer Read Handle; + property GetSampleRate: Integer Read SampleRate; + end; + + { It supports reading a single channel wave with 16-bit encoded samples. Samples are normalized to the range [-1, 1]. 
} @@ -656,6 +728,47 @@ SherpaOnnxResampleOut = record PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut; + SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig = record + Model: PAnsiChar; + end; + + SherpaOnnxOfflineSpeakerSegmentationModelConfig = record + Pyannote: SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig; + NumThreads: cint32; + Debug: cint32; + Provider: PAnsiChar; + end; + + SherpaOnnxFastClusteringConfig = record + NumClusters: cint32; + Threshold: cfloat; + end; + + SherpaOnnxSpeakerEmbeddingExtractorConfig = record + Model: PAnsiChar; + NumThreads: cint32; + Debug: cint32; + Provider: PAnsiChar; + end; + + SherpaOnnxOfflineSpeakerDiarizationConfig = record + Segmentation: SherpaOnnxOfflineSpeakerSegmentationModelConfig; + Embedding: SherpaOnnxSpeakerEmbeddingExtractorConfig; + Clustering: SherpaOnnxFastClusteringConfig; + MinDurationOn: cfloat; + MinDurationOff: cfloat; + end; + + SherpaOnnxOfflineSpeakerDiarizationSegment = record + Start: cfloat; + Stop: cfloat; + Speaker: cint32; + end; + + PSherpaOnnxOfflineSpeakerDiarizationSegment = ^SherpaOnnxOfflineSpeakerDiarizationSegment; + + PSherpaOnnxOfflineSpeakerDiarizationConfig = ^SherpaOnnxOfflineSpeakerDiarizationConfig; + function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32; SampleRateOutHz: cint32; FilterCutoffHz: cfloat; @@ -677,6 +790,37 @@ procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdec procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl; external SherpaOnnxLibName; +function SherpaOnnxCreateOfflineSpeakerDiarization(Config: PSherpaOnnxOfflineSpeakerDiarizationConfig): Pointer; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxDestroyOfflineSpeakerDiarization(P: Pointer); cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(P: Pointer): cint32; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxOfflineSpeakerDiarizationSetConfig(P: Pointer; Config: 
PSherpaOnnxOfflineSpeakerDiarizationConfig); cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(P: Pointer): cint32; cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(P: Pointer): PSherpaOnnxOfflineSpeakerDiarizationSegment; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxOfflineSpeakerDiarizationDestroySegment(P: Pointer); cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxOfflineSpeakerDiarizationProcess(P: Pointer; Samples: pcfloat; N: cint32): Pointer; cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg(P: Pointer; + Samples: pcfloat; N: cint32; Callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): Pointer; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxOfflineSpeakerDiarizationDestroyResult(P: Pointer); cdecl; + external SherpaOnnxLibName; + function SherpaOnnxCreateOfflineTts(Config: PSherpaOnnxOfflineTtsConfig): Pointer; cdecl; external SherpaOnnxLibName; @@ -1773,7 +1917,7 @@ function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer; function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer; Speed: Single; - Callback:PSherpaOnnxGeneratedAudioCallbackWithArg; + Callback: PSherpaOnnxGeneratedAudioCallbackWithArg; Arg: Pointer ): TSherpaOnnxGeneratedAudio; var @@ -1847,4 +1991,195 @@ procedure TSherpaOnnxLinearResampler.Reset; SherpaOnnxLinearResamplerReset(Self.Handle); end; +function TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig(' + + 'Model := %s)',[Self.Model]); +end; + +function TSherpaOnnxOfflineSpeakerSegmentationModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineSpeakerSegmentationModelConfig(' + + 'Pyannote := %s, ' + + 'NumThreads := %d, ' + + 'Debug 
:= %s, ' + + 'Provider := %s)', + [Self.Pyannote.ToString, Self.NumThreads, + Self.Debug.ToString, Self.Provider]); +end; + +class operator TSherpaOnnxOfflineSpeakerSegmentationModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerSegmentationModelConfig); +begin + Dest.NumThreads := 1; + Dest.Debug := False; + Dest.Provider := 'cpu'; +end; + +function TSherpaOnnxFastClusteringConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxFastClusteringConfig(' + + 'NumClusters := %d, Threshold := %.3f)', + [Self.NumClusters, Self.Threshold]); +end; + +class operator TSherpaOnnxFastClusteringConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFastClusteringConfig); +begin + Dest.NumClusters := -1; + Dest.Threshold := 0.5; +end; + +function TSherpaOnnxSpeakerEmbeddingExtractorConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxSpeakerEmbeddingExtractorConfig(' + + 'Model := %s, '+ + 'NumThreads := %d, '+ + 'Debug := %s, '+ + 'Provider := %s)', + [Self.Model, Self.NumThreads, Self.Debug.ToString, Self.Provider]); +end; + +class operator TSherpaOnnxSpeakerEmbeddingExtractorConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSpeakerEmbeddingExtractorConfig); +begin + Dest.NumThreads := 1; + Dest.Debug := False; + Dest.Provider := 'cpu'; +end; + +function TSherpaOnnxOfflineSpeakerDiarizationConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineSpeakerDiarizationConfig(' + + 'Segmentation := %s, '+ + 'Embedding := %s, '+ + 'Clustering := %s, '+ + 'MinDurationOn := %.3f, '+ + 'MinDurationOff := %.3f)', + [Self.Segmentation.ToString, Self.Embedding.ToString, + Self.Clustering.ToString, Self.MinDurationOn, Self.MinDurationOff]); +end; + +class operator TSherpaOnnxOfflineSpeakerDiarizationConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSpeakerDiarizationConfig); +begin + Dest.MinDurationOn := 0.2; + Dest.MinDurationOff := 0.5; +end; + 
+function TSherpaOnnxOfflineSpeakerDiarizationSegment.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineSpeakerDiarizationSegment(' + + 'Start := %.3f, '+ + 'Stop := %.3f, '+ + 'Speaker := %d)', + [Self.Start, Self.Stop, Self.Speaker]); +end; + +constructor TSherpaOnnxOfflineSpeakerDiarization.Create(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig); +var + C: SherpaOnnxOfflineSpeakerDiarizationConfig; +begin + C := Default(SherpaOnnxOfflineSpeakerDiarizationConfig); + C.Segmentation.Pyannote.Model := PAnsiChar(Config.Segmentation.Pyannote.Model); + C.Segmentation.NumThreads := Config.Segmentation.NumThreads; + C.Segmentation.Debug := Ord(Config.Segmentation.Debug); + C.Segmentation.Provider := PAnsiChar(Config.Segmentation.Provider); + + C.Embedding.Model := PAnsiChar(Config.Embedding.Model); + C.Embedding.NumThreads := Config.Embedding.NumThreads; + C.Embedding.Debug := Ord(Config.Embedding.Debug); + C.Embedding.Provider := PAnsiChar(Config.Embedding.Provider); + + C.Clustering.NumClusters := Config.Clustering.NumClusters; + C.Clustering.Threshold := Config.Clustering.Threshold; + + C.MinDurationOn := Config.MinDurationOn; + C.MinDurationOff := Config.MinDurationOff; + + Self.Handle := SherpaOnnxCreateOfflineSpeakerDiarization(@C); + Self._Config := Config; + Self.SampleRate := 0; + + if Self.Handle <> nil then + begin + Self.SampleRate := SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(Self.Handle); + end; +end; + +destructor TSherpaOnnxOfflineSpeakerDiarization.Destroy; +begin + SherpaOnnxDestroyOfflineSpeakerDiarization(Self.Handle); + Self.Handle := nil; +end; + +procedure TSherpaOnnxOfflineSpeakerDiarization.SetConfig(Config: TSherpaOnnxOfflineSpeakerDiarizationConfig); +var + C: SherpaOnnxOfflineSpeakerDiarizationConfig; +begin + C := Default(SherpaOnnxOfflineSpeakerDiarizationConfig); + + C.Clustering.NumClusters := Config.Clustering.NumClusters; + C.Clustering.Threshold := Config.Clustering.Threshold; + + 
SherpaOnnxOfflineSpeakerDiarizationSetConfig(Self.Handle, @C); +end; + +function TSherpaOnnxOfflineSpeakerDiarization.Process(Samples: array of Single): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; +var + R: Pointer; + NumSegments: Integer; + I: Integer; + Segments: PSherpaOnnxOfflineSpeakerDiarizationSegment; +begin + Result := nil; + + R := SherpaOnnxOfflineSpeakerDiarizationProcess(Self.Handle, pcfloat(Samples), Length(Samples)); + if R = nil then + begin + Exit + end; + NumSegments := SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(R); + + Segments := SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(R); + + SetLength(Result, NumSegments); + for I := Low(Result) to High(Result) do + begin + Result[I].Start := Segments[I].Start; + Result[I].Stop := Segments[I].Stop; + Result[I].Speaker := Segments[I].Speaker; + end; + + SherpaOnnxOfflineSpeakerDiarizationDestroySegment(Segments); + SherpaOnnxOfflineSpeakerDiarizationDestroyResult(R); +end; + +function TSherpaOnnxOfflineSpeakerDiarization.Process(Samples: array of Single; + callback: PSherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg): TSherpaOnnxOfflineSpeakerDiarizationSegmentArray; +var + R: Pointer; + NumSegments: Integer; + I: Integer; + Segments: PSherpaOnnxOfflineSpeakerDiarizationSegment; +begin + Result := nil; + + R := SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg(Self.Handle, pcfloat(Samples), Length(Samples), callback); + if R = nil then + begin + Exit + end; + NumSegments := SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(R); + + Segments := SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(R); + + SetLength(Result, NumSegments); + for I := Low(Result) to High(Result) do + begin + Result[I].Start := Segments[I].Start; + Result[I].Stop := Segments[I].Stop; + Result[I].Speaker := Segments[I].Speaker; + end; + + SherpaOnnxOfflineSpeakerDiarizationDestroySegment(Segments); + SherpaOnnxOfflineSpeakerDiarizationDestroyResult(R); +end; + end. 
From 94b26ff07c1b6275d1830cd2987081a0bdbedacb Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sat, 12 Oct 2024 13:03:48 +0800 Subject: [PATCH 233/237] Android JNI support for speaker diarization (#1421) --- .../csrc/offline-speaker-diarization-impl.cc | 14 ++++++++++++++ .../csrc/offline-speaker-diarization-impl.h | 10 ++++++++++ ...ffline-speaker-diarization-pyannote-impl.h | 16 ++++++++++++++++ .../csrc/offline-speaker-diarization.cc | 6 ++++++ .../csrc/offline-speaker-diarization.h | 10 ++++++++++ ...ine-speaker-segmentation-pyannote-model.cc | 18 ++++++++++++++++++ ...line-speaker-segmentation-pyannote-model.h | 10 ++++++++++ .../sherpa-onnx-vad-microphone-offline-asr.cc | 2 +- sherpa-onnx/jni/audio-tagging.cc | 1 + sherpa-onnx/jni/keyword-spotter.cc | 2 ++ sherpa-onnx/jni/offline-punctuation.cc | 2 ++ sherpa-onnx/jni/offline-recognizer.cc | 2 ++ .../jni/offline-speaker-diarization.cc | 19 ++++++++++++++++++- sherpa-onnx/jni/offline-tts.cc | 1 + sherpa-onnx/jni/online-recognizer.cc | 1 + .../jni/speaker-embedding-extractor.cc | 1 + .../jni/spoken-language-identification.cc | 1 + sherpa-onnx/jni/voice-activity-detector.cc | 2 ++ 18 files changed, 116 insertions(+), 2 deletions(-) diff --git a/sherpa-onnx/csrc/offline-speaker-diarization-impl.cc b/sherpa-onnx/csrc/offline-speaker-diarization-impl.cc index e41a7767a..15c3a2eb4 100644 --- a/sherpa-onnx/csrc/offline-speaker-diarization-impl.cc +++ b/sherpa-onnx/csrc/offline-speaker-diarization-impl.cc @@ -23,4 +23,18 @@ OfflineSpeakerDiarizationImpl::Create( return nullptr; } +#if __ANDROID_API__ >= 9 +std::unique_ptr +OfflineSpeakerDiarizationImpl::Create( + AAssetManager *mgr, const OfflineSpeakerDiarizationConfig &config) { + if (!config.segmentation.pyannote.model.empty()) { + return std::make_unique(mgr, config); + } + + SHERPA_ONNX_LOGE("Please specify a speaker segmentation model."); + + return nullptr; +} +#endif + } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-speaker-diarization-impl.h 
b/sherpa-onnx/csrc/offline-speaker-diarization-impl.h index 3aed9d72f..41f0e1e2f 100644 --- a/sherpa-onnx/csrc/offline-speaker-diarization-impl.h +++ b/sherpa-onnx/csrc/offline-speaker-diarization-impl.h @@ -8,6 +8,11 @@ #include #include +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + #include "sherpa-onnx/csrc/offline-speaker-diarization.h" namespace sherpa_onnx { @@ -16,6 +21,11 @@ class OfflineSpeakerDiarizationImpl { static std::unique_ptr Create( const OfflineSpeakerDiarizationConfig &config); +#if __ANDROID_API__ >= 9 + static std::unique_ptr Create( + AAssetManager *mgr, const OfflineSpeakerDiarizationConfig &config); +#endif + virtual ~OfflineSpeakerDiarizationImpl() = default; virtual int32_t SampleRate() const = 0; diff --git a/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h b/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h index 0c70f0bc6..aaedc3be0 100644 --- a/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h +++ b/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h @@ -10,6 +10,11 @@ #include #include +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + #include "Eigen/Dense" #include "sherpa-onnx/csrc/fast-clustering.h" #include "sherpa-onnx/csrc/math.h" @@ -65,6 +70,17 @@ class OfflineSpeakerDiarizationPyannoteImpl Init(); } +#if __ANDROID_API__ >= 9 + OfflineSpeakerDiarizationPyannoteImpl( + AAssetManager *mgr, const OfflineSpeakerDiarizationConfig &config) + : config_(config), + segmentation_model_(mgr, config_.segmentation), + embedding_extractor_(mgr, config_.embedding), + clustering_(std::make_unique(config_.clustering)) { + Init(); + } +#endif + int32_t SampleRate() const override { const auto &meta_data = segmentation_model_.GetModelMetaData(); diff --git a/sherpa-onnx/csrc/offline-speaker-diarization.cc b/sherpa-onnx/csrc/offline-speaker-diarization.cc index 
00733bfb2..f34ea4e0e 100644 --- a/sherpa-onnx/csrc/offline-speaker-diarization.cc +++ b/sherpa-onnx/csrc/offline-speaker-diarization.cc @@ -73,6 +73,12 @@ OfflineSpeakerDiarization::OfflineSpeakerDiarization( const OfflineSpeakerDiarizationConfig &config) : impl_(OfflineSpeakerDiarizationImpl::Create(config)) {} +#if __ANDROID_API__ >= 9 +OfflineSpeakerDiarization::OfflineSpeakerDiarization( + AAssetManager *mgr, const OfflineSpeakerDiarizationConfig &config) + : impl_(OfflineSpeakerDiarizationImpl::Create(mgr, config)) {} +#endif + OfflineSpeakerDiarization::~OfflineSpeakerDiarization() = default; int32_t OfflineSpeakerDiarization::SampleRate() const { diff --git a/sherpa-onnx/csrc/offline-speaker-diarization.h b/sherpa-onnx/csrc/offline-speaker-diarization.h index 376e5f975..4a517fbb2 100644 --- a/sherpa-onnx/csrc/offline-speaker-diarization.h +++ b/sherpa-onnx/csrc/offline-speaker-diarization.h @@ -9,6 +9,11 @@ #include #include +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + #include "sherpa-onnx/csrc/fast-clustering-config.h" #include "sherpa-onnx/csrc/offline-speaker-diarization-result.h" #include "sherpa-onnx/csrc/offline-speaker-segmentation-model-config.h" @@ -57,6 +62,11 @@ class OfflineSpeakerDiarization { explicit OfflineSpeakerDiarization( const OfflineSpeakerDiarizationConfig &config); +#if __ANDROID_API__ >= 9 + OfflineSpeakerDiarization(AAssetManager *mgr, + const OfflineSpeakerDiarizationConfig &config); +#endif + ~OfflineSpeakerDiarization(); // Expected sample rate of the input audio samples diff --git a/sherpa-onnx/csrc/offline-speaker-segmentation-pyannote-model.cc b/sherpa-onnx/csrc/offline-speaker-segmentation-pyannote-model.cc index 3f3323698..e3768dcf4 100644 --- a/sherpa-onnx/csrc/offline-speaker-segmentation-pyannote-model.cc +++ b/sherpa-onnx/csrc/offline-speaker-segmentation-pyannote-model.cc @@ -24,6 +24,17 @@ class OfflineSpeakerSegmentationPyannoteModel::Impl { 
Init(buf.data(), buf.size()); } +#if __ANDROID_API__ >= 9 + Impl(AAssetManager *mgr, const OfflineSpeakerSegmentationModelConfig &config) + : config_(config), + env_(ORT_LOGGING_LEVEL_ERROR), + sess_opts_(GetSessionOptions(config)), + allocator_{} { + auto buf = ReadFile(mgr, config_.pyannote.model); + Init(buf.data(), buf.size()); + } +#endif + const OfflineSpeakerSegmentationPyannoteModelMetaData &GetModelMetaData() const { return meta_data_; @@ -92,6 +103,13 @@ OfflineSpeakerSegmentationPyannoteModel:: const OfflineSpeakerSegmentationModelConfig &config) : impl_(std::make_unique(config)) {} +#if __ANDROID_API__ >= 9 +OfflineSpeakerSegmentationPyannoteModel:: + OfflineSpeakerSegmentationPyannoteModel( + AAssetManager *mgr, const OfflineSpeakerSegmentationModelConfig &config) + : impl_(std::make_unique(mgr, config)) {} +#endif + OfflineSpeakerSegmentationPyannoteModel:: ~OfflineSpeakerSegmentationPyannoteModel() = default; diff --git a/sherpa-onnx/csrc/offline-speaker-segmentation-pyannote-model.h b/sherpa-onnx/csrc/offline-speaker-segmentation-pyannote-model.h index b504c373f..6b835763b 100644 --- a/sherpa-onnx/csrc/offline-speaker-segmentation-pyannote-model.h +++ b/sherpa-onnx/csrc/offline-speaker-segmentation-pyannote-model.h @@ -6,6 +6,11 @@ #include +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + #include "onnxruntime_cxx_api.h" // NOLINT #include "sherpa-onnx/csrc/offline-speaker-segmentation-model-config.h" #include "sherpa-onnx/csrc/offline-speaker-segmentation-pyannote-model-meta-data.h" @@ -17,6 +22,11 @@ class OfflineSpeakerSegmentationPyannoteModel { explicit OfflineSpeakerSegmentationPyannoteModel( const OfflineSpeakerSegmentationModelConfig &config); +#if __ANDROID_API__ >= 9 + OfflineSpeakerSegmentationPyannoteModel( + AAssetManager *mgr, const OfflineSpeakerSegmentationModelConfig &config); +#endif + ~OfflineSpeakerSegmentationPyannoteModel(); const 
OfflineSpeakerSegmentationPyannoteModelMetaData &GetModelMetaData() diff --git a/sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc b/sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc index c90c29c52..df3e250a5 100644 --- a/sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc +++ b/sherpa-onnx/csrc/sherpa-onnx-vad-microphone-offline-asr.cc @@ -211,7 +211,7 @@ to download models for offline ASR. } while (!vad->Empty()) { - auto &segment = vad->Front(); + const auto &segment = vad->Front(); auto s = recognizer.CreateStream(); s->AcceptWaveform(sample_rate, segment.samples.data(), segment.samples.size()); diff --git a/sherpa-onnx/jni/audio-tagging.cc b/sherpa-onnx/jni/audio-tagging.cc index ff8db0089..7ad6e7d53 100644 --- a/sherpa-onnx/jni/audio-tagging.cc +++ b/sherpa-onnx/jni/audio-tagging.cc @@ -70,6 +70,7 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_AudioTagging_newFromAsset( AAssetManager *mgr = AAssetManager_fromJava(env, asset_manager); if (!mgr) { SHERPA_ONNX_LOGE("Failed to get asset manager: %p", mgr); + return 0; } #endif diff --git a/sherpa-onnx/jni/keyword-spotter.cc b/sherpa-onnx/jni/keyword-spotter.cc index ca0c229c2..4ac80a294 100644 --- a/sherpa-onnx/jni/keyword-spotter.cc +++ b/sherpa-onnx/jni/keyword-spotter.cc @@ -115,10 +115,12 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_KeywordSpotter_newFromAsset( AAssetManager *mgr = AAssetManager_fromJava(env, asset_manager); if (!mgr) { SHERPA_ONNX_LOGE("Failed to get asset manager: %p", mgr); + return 0; } #endif auto config = sherpa_onnx::GetKwsConfig(env, _config); SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); + auto kws = new sherpa_onnx::KeywordSpotter( #if __ANDROID_API__ >= 9 mgr, diff --git a/sherpa-onnx/jni/offline-punctuation.cc b/sherpa-onnx/jni/offline-punctuation.cc index 5056a3ac4..efe03cac0 100644 --- a/sherpa-onnx/jni/offline-punctuation.cc +++ b/sherpa-onnx/jni/offline-punctuation.cc @@ -53,10 +53,12 @@ 
Java_com_k2fsa_sherpa_onnx_OfflinePunctuation_newFromAsset( AAssetManager *mgr = AAssetManager_fromJava(env, asset_manager); if (!mgr) { SHERPA_ONNX_LOGE("Failed to get asset manager: %p", mgr); + return 0; } #endif auto config = sherpa_onnx::GetOfflinePunctuationConfig(env, _config); SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); + auto model = new sherpa_onnx::OfflinePunctuation( #if __ANDROID_API__ >= 9 mgr, diff --git a/sherpa-onnx/jni/offline-recognizer.cc b/sherpa-onnx/jni/offline-recognizer.cc index 8c1265bba..5e4b359b6 100644 --- a/sherpa-onnx/jni/offline-recognizer.cc +++ b/sherpa-onnx/jni/offline-recognizer.cc @@ -233,10 +233,12 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_newFromAsset(JNIEnv *env, AAssetManager *mgr = AAssetManager_fromJava(env, asset_manager); if (!mgr) { SHERPA_ONNX_LOGE("Failed to get asset manager: %p", mgr); + return 0; } #endif auto config = sherpa_onnx::GetOfflineConfig(env, _config); SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); + auto model = new sherpa_onnx::OfflineRecognizer( #if __ANDROID_API__ >= 9 mgr, diff --git a/sherpa-onnx/jni/offline-speaker-diarization.cc b/sherpa-onnx/jni/offline-speaker-diarization.cc index e82962c80..ba4e14bc3 100644 --- a/sherpa-onnx/jni/offline-speaker-diarization.cc +++ b/sherpa-onnx/jni/offline-speaker-diarization.cc @@ -101,7 +101,24 @@ SHERPA_ONNX_EXTERN_C JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineSpeakerDiarization_newFromAsset( JNIEnv *env, jobject /*obj*/, jobject asset_manager, jobject _config) { - return 0; +#if __ANDROID_API__ >= 9 + AAssetManager *mgr = AAssetManager_fromJava(env, asset_manager); + if (!mgr) { + SHERPA_ONNX_LOGE("Failed to get asset manager: %p", mgr); + return 0; + } +#endif + + auto config = sherpa_onnx::GetOfflineSpeakerDiarizationConfig(env, _config); + SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); + + auto sd = new sherpa_onnx::OfflineSpeakerDiarization( +#if __ANDROID_API__ >= 9 + mgr, +#endif + config); 
+ + return (jlong)sd; } SHERPA_ONNX_EXTERN_C diff --git a/sherpa-onnx/jni/offline-tts.cc b/sherpa-onnx/jni/offline-tts.cc index 43a93e0e0..4d67afc27 100644 --- a/sherpa-onnx/jni/offline-tts.cc +++ b/sherpa-onnx/jni/offline-tts.cc @@ -105,6 +105,7 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_newFromAsset( AAssetManager *mgr = AAssetManager_fromJava(env, asset_manager); if (!mgr) { SHERPA_ONNX_LOGE("Failed to get asset manager: %p", mgr); + return 0; } #endif auto config = sherpa_onnx::GetOfflineTtsConfig(env, _config); diff --git a/sherpa-onnx/jni/online-recognizer.cc b/sherpa-onnx/jni/online-recognizer.cc index 1793cf73b..dbe205c4e 100644 --- a/sherpa-onnx/jni/online-recognizer.cc +++ b/sherpa-onnx/jni/online-recognizer.cc @@ -267,6 +267,7 @@ Java_com_k2fsa_sherpa_onnx_OnlineRecognizer_newFromAsset(JNIEnv *env, AAssetManager *mgr = AAssetManager_fromJava(env, asset_manager); if (!mgr) { SHERPA_ONNX_LOGE("Failed to get asset manager: %p", mgr); + return 0; } #endif auto config = sherpa_onnx::GetConfig(env, _config); diff --git a/sherpa-onnx/jni/speaker-embedding-extractor.cc b/sherpa-onnx/jni/speaker-embedding-extractor.cc index b1190bffc..33d630ee6 100644 --- a/sherpa-onnx/jni/speaker-embedding-extractor.cc +++ b/sherpa-onnx/jni/speaker-embedding-extractor.cc @@ -45,6 +45,7 @@ Java_com_k2fsa_sherpa_onnx_SpeakerEmbeddingExtractor_newFromAsset( AAssetManager *mgr = AAssetManager_fromJava(env, asset_manager); if (!mgr) { SHERPA_ONNX_LOGE("Failed to get asset manager: %p", mgr); + return 0; } #endif auto config = sherpa_onnx::GetSpeakerEmbeddingExtractorConfig(env, _config); diff --git a/sherpa-onnx/jni/spoken-language-identification.cc b/sherpa-onnx/jni/spoken-language-identification.cc index 278c6adbf..fcb6f228a 100644 --- a/sherpa-onnx/jni/spoken-language-identification.cc +++ b/sherpa-onnx/jni/spoken-language-identification.cc @@ -62,6 +62,7 @@ Java_com_k2fsa_sherpa_onnx_SpokenLanguageIdentification_newFromAsset( AAssetManager *mgr = 
AAssetManager_fromJava(env, asset_manager); if (!mgr) { SHERPA_ONNX_LOGE("Failed to get asset manager: %p", mgr); + return 0; } #endif diff --git a/sherpa-onnx/jni/voice-activity-detector.cc b/sherpa-onnx/jni/voice-activity-detector.cc index 319edd09b..a30423f70 100644 --- a/sherpa-onnx/jni/voice-activity-detector.cc +++ b/sherpa-onnx/jni/voice-activity-detector.cc @@ -71,10 +71,12 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_Vad_newFromAsset( AAssetManager *mgr = AAssetManager_fromJava(env, asset_manager); if (!mgr) { SHERPA_ONNX_LOGE("Failed to get asset manager: %p", mgr); + return 0; } #endif auto config = sherpa_onnx::GetVadModelConfig(env, _config); SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); + auto model = new sherpa_onnx::VoiceActivityDetector( #if __ANDROID_API__ >= 9 mgr, From 5a22f74b2b0700b9f986bc9f01ae93b58b2117c9 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 13 Oct 2024 14:02:57 +0800 Subject: [PATCH 234/237] Android demo for speaker diarization (#1423) --- .../workflows/apk-speaker-diarization.yaml | 175 +++++++++++++++ .../workflows/apk-speaker-identification.yaml | 62 ++++++ .github/workflows/apk-vad.yaml | 2 +- README.md | 51 +++-- android/README.md | 2 + .../SherpaOnnxSpeakerDiarization/.gitignore | 15 ++ .../app/.gitignore | 1 + .../app/build.gradle.kts | 71 ++++++ .../app/proguard-rules.pro | 21 ++ .../diarization/ExampleInstrumentedTest.kt | 24 ++ .../app/src/main/AndroidManifest.xml | 32 +++ .../app/src/main/assets/.gitkeep | 0 .../onnx/speaker/diarization/BarItem.kt | 13 ++ .../onnx/speaker/diarization/MainActivity.kt | 132 +++++++++++ .../onnx/speaker/diarization/NavBarItems.kt | 20 ++ .../onnx/speaker/diarization/NavRoutes.kt | 6 + .../diarization/OfflineSpeakerDiarization.kt | 1 + .../onnx/speaker/diarization/ReadWaveFile.kt | 137 ++++++++++++ .../diarization/SpeakerDiarizationObject.kt | 66 ++++++ .../SpeakerEmbeddingExtractorConfig.kt | 1 + .../onnx/speaker/diarization/screens/Help.kt | 38 ++++ 
.../onnx/speaker/diarization/screens/Home.kt | 210 ++++++++++++++++++ .../speaker/diarization/ui/theme/Color.kt | 11 + .../speaker/diarization/ui/theme/Theme.kt | 58 +++++ .../onnx/speaker/diarization/ui/theme/Type.kt | 34 +++ .../app/src/main/jniLibs/arm64-v8a/.gitkeep | 0 .../app/src/main/jniLibs/armeabi-v7a/.gitkeep | 0 .../app/src/main/jniLibs/x86/.gitkeep | 0 .../app/src/main/jniLibs/x86_64/.gitkeep | 0 .../drawable-v24/ic_launcher_foreground.xml | 30 +++ .../res/drawable/ic_launcher_background.xml | 170 ++++++++++++++ .../res/mipmap-anydpi-v26/ic_launcher.xml | 6 + .../mipmap-anydpi-v26/ic_launcher_round.xml | 6 + .../src/main/res/mipmap-hdpi/ic_launcher.webp | Bin 0 -> 1404 bytes .../res/mipmap-hdpi/ic_launcher_round.webp | Bin 0 -> 2898 bytes .../src/main/res/mipmap-mdpi/ic_launcher.webp | Bin 0 -> 982 bytes .../res/mipmap-mdpi/ic_launcher_round.webp | Bin 0 -> 1772 bytes .../main/res/mipmap-xhdpi/ic_launcher.webp | Bin 0 -> 1900 bytes .../res/mipmap-xhdpi/ic_launcher_round.webp | Bin 0 -> 3918 bytes .../main/res/mipmap-xxhdpi/ic_launcher.webp | Bin 0 -> 2884 bytes .../res/mipmap-xxhdpi/ic_launcher_round.webp | Bin 0 -> 5914 bytes .../main/res/mipmap-xxxhdpi/ic_launcher.webp | Bin 0 -> 3844 bytes .../res/mipmap-xxxhdpi/ic_launcher_round.webp | Bin 0 -> 7778 bytes .../app/src/main/res/values/colors.xml | 10 + .../app/src/main/res/values/strings.xml | 3 + .../app/src/main/res/values/themes.xml | 5 + .../app/src/main/res/xml/backup_rules.xml | 13 ++ .../main/res/xml/data_extraction_rules.xml | 19 ++ .../speaker/diarization/ExampleUnitTest.kt | 17 ++ .../build.gradle.kts | 5 + .../gradle.properties | 23 ++ .../gradle/libs.versions.toml | 35 +++ .../gradle/wrapper/gradle-wrapper.jar | Bin 0 -> 59203 bytes .../gradle/wrapper/gradle-wrapper.properties | 6 + android/SherpaOnnxSpeakerDiarization/gradlew | 185 +++++++++++++++ .../SherpaOnnxSpeakerDiarization/gradlew.bat | 89 ++++++++ .../settings.gradle.kts | 23 ++ .../SpeakerEmbeddingExtractorConfig.kt | 1 + 
.../SpeakerEmbeddingExtractorConfig.kt | 1 + kotlin-api-examples/run.sh | 2 + scripts/apk/build-apk-speaker-diarization.sh | 73 ++++++ .../kotlin-api/OfflineSpeakerDiarization.kt | 11 +- sherpa-onnx/kotlin-api/Speaker.kt | 7 - .../SpeakerEmbeddingExtractorConfig.kt | 8 + 64 files changed, 1905 insertions(+), 26 deletions(-) create mode 100644 .github/workflows/apk-speaker-diarization.yaml create mode 100644 android/SherpaOnnxSpeakerDiarization/.gitignore create mode 100644 android/SherpaOnnxSpeakerDiarization/app/.gitignore create mode 100644 android/SherpaOnnxSpeakerDiarization/app/build.gradle.kts create mode 100644 android/SherpaOnnxSpeakerDiarization/app/proguard-rules.pro create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/androidTest/java/com/k2fsa/sherpa/onnx/speaker/diarization/ExampleInstrumentedTest.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/AndroidManifest.xml create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/assets/.gitkeep create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/BarItem.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/MainActivity.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/NavBarItems.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/NavRoutes.kt create mode 120000 android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/OfflineSpeakerDiarization.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ReadWaveFile.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/SpeakerDiarizationObject.kt create mode 120000 
android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/SpeakerEmbeddingExtractorConfig.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/screens/Help.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/screens/Home.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ui/theme/Color.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ui/theme/Theme.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ui/theme/Type.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/arm64-v8a/.gitkeep create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/armeabi-v7a/.gitkeep create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/x86/.gitkeep create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/x86_64/.gitkeep create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/drawable-v24/ic_launcher_foreground.xml create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/drawable/ic_launcher_background.xml create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-hdpi/ic_launcher.webp create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-mdpi/ic_launcher.webp create mode 100644 
android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-xhdpi/ic_launcher.webp create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/values/colors.xml create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/values/strings.xml create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/values/themes.xml create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/xml/backup_rules.xml create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/main/res/xml/data_extraction_rules.xml create mode 100644 android/SherpaOnnxSpeakerDiarization/app/src/test/java/com/k2fsa/sherpa/onnx/speaker/diarization/ExampleUnitTest.kt create mode 100644 android/SherpaOnnxSpeakerDiarization/build.gradle.kts create mode 100644 android/SherpaOnnxSpeakerDiarization/gradle.properties create mode 100644 android/SherpaOnnxSpeakerDiarization/gradle/libs.versions.toml create mode 100644 android/SherpaOnnxSpeakerDiarization/gradle/wrapper/gradle-wrapper.jar create mode 100644 android/SherpaOnnxSpeakerDiarization/gradle/wrapper/gradle-wrapper.properties create mode 100755 android/SherpaOnnxSpeakerDiarization/gradlew create mode 100644 android/SherpaOnnxSpeakerDiarization/gradlew.bat create mode 100644 android/SherpaOnnxSpeakerDiarization/settings.gradle.kts create mode 120000 
android/SherpaOnnxSpeakerIdentification/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/identification/SpeakerEmbeddingExtractorConfig.kt create mode 120000 kotlin-api-examples/SpeakerEmbeddingExtractorConfig.kt create mode 100755 scripts/apk/build-apk-speaker-diarization.sh create mode 100644 sherpa-onnx/kotlin-api/SpeakerEmbeddingExtractorConfig.kt diff --git a/.github/workflows/apk-speaker-diarization.yaml b/.github/workflows/apk-speaker-diarization.yaml new file mode 100644 index 000000000..19f0b99bc --- /dev/null +++ b/.github/workflows/apk-speaker-diarization.yaml @@ -0,0 +1,175 @@ +name: apk-speaker-diarization + +on: + push: + branches: + - apk + - android-demo-speaker-diarization-2 + + workflow_dispatch: + +concurrency: + group: apk-speaker-diarization-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: write + +jobs: + apk_speaker_identification: + if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa' + runs-on: ${{ matrix.os }} + name: apk for speaker diarization ${{ matrix.index }}/${{ matrix.total }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + total: ["1"] + index: ["0"] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + # https://github.com/actions/setup-java + - uses: actions/setup-java@v4 + with: + distribution: 'temurin' # See 'Supported distributions' for available options + java-version: '21' + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ matrix.os }}-android + + - name: Display NDK HOME + shell: bash + run: | + echo "ANDROID_NDK_LATEST_HOME: ${ANDROID_NDK_LATEST_HOME}" + ls -lh ${ANDROID_NDK_LATEST_HOME} + + - name: Install Python dependencies + shell: bash + run: | + python3 -m pip install --upgrade pip jinja2 + + - name: Setup build tool version variable + shell: bash + run: | + echo "---" + ls -lh /usr/local/lib/android/ + echo "---" + + ls -lh /usr/local/lib/android/sdk + echo "---" + + ls -lh 
/usr/local/lib/android/sdk/build-tools + echo "---" + + BUILD_TOOL_VERSION=$(ls /usr/local/lib/android/sdk/build-tools/ | tail -n 1) + echo "BUILD_TOOL_VERSION=$BUILD_TOOL_VERSION" >> $GITHUB_ENV + echo "Last build tool version is: $BUILD_TOOL_VERSION" + + - name: Generate build script + shell: bash + run: | + cd scripts/apk + + chmod +x build-apk-speaker-diarization.sh + mv -v ./build-apk-speaker-diarization.sh ../.. + + - name: build APK + shell: bash + run: | + export CMAKE_CXX_COMPILER_LAUNCHER=ccache + export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" + cmake --version + + export ANDROID_NDK=$ANDROID_NDK_LATEST_HOME + ./build-apk-speaker-diarization.sh + + - name: Display APK + shell: bash + run: | + ls -lh ./apks/ + du -h -d1 . + + # https://github.com/marketplace/actions/sign-android-release + - uses: r0adkll/sign-android-release@v1 + name: Sign app APK + with: + releaseDirectory: ./apks + signingKeyBase64: ${{ secrets.ANDROID_SIGNING_KEY }} + alias: ${{ secrets.ANDROID_SIGNING_KEY_ALIAS }} + keyStorePassword: ${{ secrets.ANDROID_SIGNING_KEY_STORE_PASSWORD }} + env: + BUILD_TOOLS_VERSION: ${{ env.BUILD_TOOL_VERSION }} + + - name: Display APK after signing + shell: bash + run: | + ls -lh ./apks/ + du -h -d1 . + + - name: Rename APK after signing + shell: bash + run: | + cd apks + rm -fv signingKey.jks + rm -fv *.apk.idsig + rm -fv *-aligned.apk + + all_apks=$(ls -1 *-signed.apk) + echo "----" + echo $all_apks + echo "----" + for apk in ${all_apks[@]}; do + n=$(echo $apk | sed -e s/-signed//) + mv -v $apk $n + done + + cd .. + + ls -lh ./apks/ + du -h -d1 . + + - name: Display APK after rename + shell: bash + run: | + ls -lh ./apks/ + du -h -d1 . 
+ + - name: Publish to huggingface + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + uses: nick-fields/retry@v3 + with: + max_attempts: 20 + timeout_seconds: 200 + shell: bash + command: | + git config --global user.email "csukuangfj@gmail.com" + git config --global user.name "Fangjun Kuang" + + rm -rf huggingface + export GIT_LFS_SKIP_SMUDGE=1 + export GIT_CLONE_PROTECTION_ACTIVE=false + + SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION" + + git clone https://huggingface.co/csukuangfj/sherpa-onnx-apk huggingface + cd huggingface + git fetch + git pull + git merge -m "merge remote" --ff origin main + + d=speaker-diarization/$SHERPA_ONNX_VERSION + mkdir -p $d/ + cp -v ../apks/*.apk $d/ + git status + git lfs track "*.apk" + git add . + git commit -m "add more apks" + git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-apk main diff --git a/.github/workflows/apk-speaker-identification.yaml b/.github/workflows/apk-speaker-identification.yaml index ca89ec49f..e32ad3bc9 100644 --- a/.github/workflows/apk-speaker-identification.yaml +++ b/.github/workflows/apk-speaker-identification.yaml @@ -53,6 +53,23 @@ jobs: run: | python3 -m pip install --upgrade pip jinja2 + - name: Setup build tool version variable + shell: bash + run: | + echo "---" + ls -lh /usr/local/lib/android/ + echo "---" + + ls -lh /usr/local/lib/android/sdk + echo "---" + + ls -lh /usr/local/lib/android/sdk/build-tools + echo "---" + + BUILD_TOOL_VERSION=$(ls /usr/local/lib/android/sdk/build-tools/ | tail -n 1) + echo "BUILD_TOOL_VERSION=$BUILD_TOOL_VERSION" >> $GITHUB_ENV + echo "Last build tool version is: $BUILD_TOOL_VERSION" + - name: Generate build script shell: bash run: | @@ -82,6 +99,51 @@ jobs: ls -lh ./apks/ du -h -d1 . 
+ # https://github.com/marketplace/actions/sign-android-release + - uses: r0adkll/sign-android-release@v1 + name: Sign app APK + with: + releaseDirectory: ./apks + signingKeyBase64: ${{ secrets.ANDROID_SIGNING_KEY }} + alias: ${{ secrets.ANDROID_SIGNING_KEY_ALIAS }} + keyStorePassword: ${{ secrets.ANDROID_SIGNING_KEY_STORE_PASSWORD }} + env: + BUILD_TOOLS_VERSION: ${{ env.BUILD_TOOL_VERSION }} + + - name: Display APK after signing + shell: bash + run: | + ls -lh ./apks/ + du -h -d1 . + + - name: Rename APK after signing + shell: bash + run: | + cd apks + rm -fv signingKey.jks + rm -fv *.apk.idsig + rm -fv *-aligned.apk + + all_apks=$(ls -1 *-signed.apk) + echo "----" + echo $all_apks + echo "----" + for apk in ${all_apks[@]}; do + n=$(echo $apk | sed -e s/-signed//) + mv -v $apk $n + done + + cd .. + + ls -lh ./apks/ + du -h -d1 . + + - name: Display APK after rename + shell: bash + run: | + ls -lh ./apks/ + du -h -d1 . + - name: Publish to huggingface env: HF_TOKEN: ${{ secrets.HF_TOKEN }} diff --git a/.github/workflows/apk-vad.yaml b/.github/workflows/apk-vad.yaml index 8253145b6..d9af75477 100644 --- a/.github/workflows/apk-vad.yaml +++ b/.github/workflows/apk-vad.yaml @@ -166,7 +166,7 @@ jobs: git pull git merge -m "merge remote" --ff origin main - d=vad/SHERPA_ONNX_VERSION + d=vad/$SHERPA_ONNX_VERSION mkdir -p $d cp -v ../apks/*.apk $d/ git status diff --git a/README.md b/README.md index 1828847e5..32d141f90 100644 --- a/README.md +++ b/README.md @@ -84,8 +84,9 @@ with the following APIs ### Links for Huggingface Spaces -You can visit the following Huggingface spaces to try `sherpa-onnx` without -installing anything. All you need is a browser. +
+You can visit the following Huggingface spaces to try sherpa-onnx without +installing anything. All you need is a browser. | Description | URL | |-------------------------------------------------------|------------------------------------| @@ -118,23 +119,34 @@ We also have spaces built using WebAssembly. They are listed below: |Speech synthesis (German) |[Click me][wasm-hf-tts-piper-de]| [地址][wasm-ms-tts-piper-de]| |Speaker diarization |[Click me][wasm-hf-speaker-diarization]|[地址][wasm-ms-speaker-diarization]| +
+ ### Links for pre-built Android APKs -| Description | URL | 中国用户 | -|----------------------------------------|------------------------------|-----------------------------| -| Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn]| -| Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] | -| Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] | -| VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] | -| Two-pass speech recognition | [Address][apk-2pass] | [点此][apk-2pass-cn] | -| Audio tagging | [Address][apk-at] | [点此][apk-at-cn] | -| Audio tagging (WearOS) | [Address][apk-at-wearos] | [点此][apk-at-wearos-cn] | -| Speaker identification | [Address][apk-sid] | [点此][apk-sid-cn] | -| Spoken language identification | [Address][apk-slid] | [点此][apk-slid-cn] | -| Keyword spotting | [Address][apk-kws] | [点此][apk-kws-cn] | +
+ +You can find pre-built Android APKs for this repository in the following table + +| Description | URL | 中国用户 | +|----------------------------------------|------------------------------------|-----------------------------------| +| Speaker diarization | [Address][apk-speaker-diarization] | [点此][apk-speaker-diarization-cn]| +| Streaming speech recognition | [Address][apk-streaming-asr] | [点此][apk-streaming-asr-cn] | +| Text-to-speech | [Address][apk-tts] | [点此][apk-tts-cn] | +| Voice activity detection (VAD) | [Address][apk-vad] | [点此][apk-vad-cn] | +| VAD + non-streaming speech recognition | [Address][apk-vad-asr] | [点此][apk-vad-asr-cn] | +| Two-pass speech recognition | [Address][apk-2pass] | [点此][apk-2pass-cn] | +| Audio tagging | [Address][apk-at] | [点此][apk-at-cn] | +| Audio tagging (WearOS) | [Address][apk-at-wearos] | [点此][apk-at-wearos-cn] | +| Speaker identification | [Address][apk-sid] | [点此][apk-sid-cn] | +| Spoken language identification | [Address][apk-slid] | [点此][apk-slid-cn] | +| Keyword spotting | [Address][apk-kws] | [点此][apk-kws-cn] | + +
### Links for pre-built Flutter APPs +
+ #### Real-time speech recognition | Description | URL | 中国用户 | @@ -153,17 +165,24 @@ We also have spaces built using WebAssembly. They are listed below: > Note: You need to build from source for iOS. +
+ ### Links for pre-built Lazarus APPs +
+ #### Generating subtitles | Description | URL | 中国用户 | |--------------------------------|----------------------------|----------------------------| | Generate subtitles (生成字幕) | [Address][lazarus-subtitle]| [点此][lazarus-subtitle-cn]| +
### Links for pre-trained models +
+ | Description | URL | |---------------------------------------------|---------------------------------------------------------------------------------------| | Speech recognition (speech to text, ASR) | [Address][asr-models] | @@ -176,6 +195,8 @@ We also have spaces built using WebAssembly. They are listed below: | Punctuation | [Address][punct-models] | | Speaker segmentation | [Address][speaker-segmentation-models] | +
+ ### Useful links - Documentation: https://k2-fsa.github.io/sherpa/onnx/ @@ -265,6 +286,8 @@ Video demo in Chinese: [爆了!炫神教你开打字挂!真正影响胜率 [wasm-ms-tts-piper-de]: https://modelscope.cn/studios/k2-fsa/web-assembly-tts-sherpa-onnx-de [wasm-hf-speaker-diarization]: https://huggingface.co/spaces/k2-fsa/web-assembly-speaker-diarization-sherpa-onnx [wasm-ms-speaker-diarization]: https://www.modelscope.cn/studios/csukuangfj/web-assembly-speaker-diarization-sherpa-onnx +[apk-speaker-diarization]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/apk.html +[apk-speaker-diarization-cn]: https://k2-fsa.github.io/sherpa/onnx/speaker-diarization/apk-cn.html [apk-streaming-asr]: https://k2-fsa.github.io/sherpa/onnx/android/apk.html [apk-streaming-asr-cn]: https://k2-fsa.github.io/sherpa/onnx/android/apk-cn.html [apk-tts]: https://k2-fsa.github.io/sherpa/onnx/tts/apk-engine.html diff --git a/android/README.md b/android/README.md index 42b29e08f..bae335598 100644 --- a/android/README.md +++ b/android/README.md @@ -4,6 +4,8 @@ Please refer to https://k2-fsa.github.io/sherpa/onnx/android/index.html for usage. +- [SherpaOnnxSpeakerDiarization](./SherpaOnnxSpeakerDiarization) It is for speaker diarization. + - [SherpaOnnx](./SherpaOnnx) It uses a streaming ASR model. 
- [SherpaOnnx2Pass](./SherpaOnnx2Pass) It uses a streaming ASR model diff --git a/android/SherpaOnnxSpeakerDiarization/.gitignore b/android/SherpaOnnxSpeakerDiarization/.gitignore new file mode 100644 index 000000000..aa724b770 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/.gitignore @@ -0,0 +1,15 @@ +*.iml +.gradle +/local.properties +/.idea/caches +/.idea/libraries +/.idea/modules.xml +/.idea/workspace.xml +/.idea/navEditor.xml +/.idea/assetWizardSettings.xml +.DS_Store +/build +/captures +.externalNativeBuild +.cxx +local.properties diff --git a/android/SherpaOnnxSpeakerDiarization/app/.gitignore b/android/SherpaOnnxSpeakerDiarization/app/.gitignore new file mode 100644 index 000000000..42afabfd2 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/.gitignore @@ -0,0 +1 @@ +/build \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/build.gradle.kts b/android/SherpaOnnxSpeakerDiarization/app/build.gradle.kts new file mode 100644 index 000000000..7a390ba42 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/build.gradle.kts @@ -0,0 +1,71 @@ +plugins { + alias(libs.plugins.android.application) + alias(libs.plugins.jetbrains.kotlin.android) +} + +android { + namespace = "com.k2fsa.sherpa.onnx.speaker.diarization" + compileSdk = 34 + + defaultConfig { + applicationId = "com.k2fsa.sherpa.onnx.speaker.diarization" + minSdk = 21 + targetSdk = 34 + versionCode = 1 + versionName = "1.0" + + testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner" + vectorDrawables { + useSupportLibrary = true + } + } + + buildTypes { + release { + isMinifyEnabled = false + proguardFiles( + getDefaultProguardFile("proguard-android-optimize.txt"), + "proguard-rules.pro" + ) + } + } + compileOptions { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 + } + kotlinOptions { + jvmTarget = "1.8" + } + buildFeatures { + compose = true + } + composeOptions { + 
kotlinCompilerExtensionVersion = "1.5.1" + } + packaging { + resources { + excludes += "/META-INF/{AL2.0,LGPL2.1}" + } + } +} + +dependencies { + + implementation(libs.androidx.core.ktx) + implementation(libs.androidx.lifecycle.runtime.ktx) + implementation(libs.androidx.activity.compose) + implementation(platform(libs.androidx.compose.bom)) + implementation(libs.androidx.ui) + implementation(libs.androidx.ui.graphics) + implementation(libs.androidx.ui.tooling.preview) + implementation(libs.androidx.material3) + implementation(libs.androidx.navigation.compose) + implementation(libs.androidx.documentfile) + testImplementation(libs.junit) + androidTestImplementation(libs.androidx.junit) + androidTestImplementation(libs.androidx.espresso.core) + androidTestImplementation(platform(libs.androidx.compose.bom)) + androidTestImplementation(libs.androidx.ui.test.junit4) + debugImplementation(libs.androidx.ui.tooling) + debugImplementation(libs.androidx.ui.test.manifest) +} \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/proguard-rules.pro b/android/SherpaOnnxSpeakerDiarization/app/proguard-rules.pro new file mode 100644 index 000000000..481bb4348 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. 
+#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/androidTest/java/com/k2fsa/sherpa/onnx/speaker/diarization/ExampleInstrumentedTest.kt b/android/SherpaOnnxSpeakerDiarization/app/src/androidTest/java/com/k2fsa/sherpa/onnx/speaker/diarization/ExampleInstrumentedTest.kt new file mode 100644 index 000000000..53d7af15f --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/androidTest/java/com/k2fsa/sherpa/onnx/speaker/diarization/ExampleInstrumentedTest.kt @@ -0,0 +1,24 @@ +package com.k2fsa.sherpa.onnx.speaker.diarization + +import androidx.test.platform.app.InstrumentationRegistry +import androidx.test.ext.junit.runners.AndroidJUnit4 + +import org.junit.Test +import org.junit.runner.RunWith + +import org.junit.Assert.* + +/** + * Instrumented test, which will execute on an Android device. + * + * See [testing documentation](http://d.android.com/tools/testing). + */ +@RunWith(AndroidJUnit4::class) +class ExampleInstrumentedTest { + @Test + fun useAppContext() { + // Context of the app under test. 
+ val appContext = InstrumentationRegistry.getInstrumentation().targetContext + assertEquals("com.k2fsa.sherpa.onnx.speaker.diarization", appContext.packageName) + } +} \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/AndroidManifest.xml b/android/SherpaOnnxSpeakerDiarization/app/src/main/AndroidManifest.xml new file mode 100644 index 000000000..d58f7e8d7 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/AndroidManifest.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/assets/.gitkeep b/android/SherpaOnnxSpeakerDiarization/app/src/main/assets/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/BarItem.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/BarItem.kt new file mode 100644 index 000000000..0895cf52c --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/BarItem.kt @@ -0,0 +1,13 @@ +package com.k2fsa.sherpa.onnx.speaker.diarization + +import androidx.compose.ui.graphics.vector.ImageVector + +data class BarItem( + val title: String, + + // see https://www.composables.com/icons + // and + // https://developer.android.com/reference/kotlin/androidx/compose/material/icons/filled/package-summary + val image: ImageVector, + val route: String, +) \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/MainActivity.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/MainActivity.kt new file mode 100644 index 000000000..7a25d49b9 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/MainActivity.kt 
@@ -0,0 +1,132 @@ +package com.k2fsa.sherpa.onnx.speaker.diarization + +import android.os.Bundle +import androidx.activity.ComponentActivity +import androidx.activity.compose.setContent +import androidx.activity.enableEdgeToEdge +import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.fillMaxSize +import androidx.compose.foundation.layout.padding +import androidx.compose.material3.CenterAlignedTopAppBar +import androidx.compose.material3.ExperimentalMaterial3Api +import androidx.compose.material3.Icon +import androidx.compose.material3.MaterialTheme +import androidx.compose.material3.NavigationBar +import androidx.compose.material3.NavigationBarItem +import androidx.compose.material3.Scaffold +import androidx.compose.material3.Surface +import androidx.compose.material3.Text +import androidx.compose.material3.TopAppBarDefaults +import androidx.compose.runtime.Composable +import androidx.compose.runtime.getValue +import androidx.compose.ui.Modifier +import androidx.compose.ui.text.font.FontWeight +import androidx.compose.ui.tooling.preview.Preview +import androidx.navigation.NavGraph.Companion.findStartDestination +import androidx.navigation.NavHostController +import androidx.navigation.compose.NavHost +import androidx.navigation.compose.composable +import androidx.navigation.compose.currentBackStackEntryAsState +import androidx.navigation.compose.rememberNavController +import com.k2fsa.sherpa.onnx.speaker.diarization.screens.HelpScreen +import com.k2fsa.sherpa.onnx.speaker.diarization.screens.HomeScreen +import com.k2fsa.sherpa.onnx.speaker.diarization.ui.theme.SherpaOnnxSpeakerDiarizationTheme + +const val TAG = "sherpa-onnx-sd" + +class MainActivity : ComponentActivity() { + override fun onCreate(savedInstanceState: Bundle?) 
{ + super.onCreate(savedInstanceState) + enableEdgeToEdge() + setContent { + SherpaOnnxSpeakerDiarizationTheme { + // A surface container using the 'background' color from the theme + Surface( + modifier = Modifier.fillMaxSize(), + color = MaterialTheme.colorScheme.background + ) { + MainScreen() + } + } + } + SpeakerDiarizationObject.initSpeakerDiarization(this.assets) + } +} + +@OptIn(ExperimentalMaterial3Api::class) +@Composable +fun MainScreen(modifier: Modifier = Modifier) { + val navController = rememberNavController() + Scaffold( + topBar = { + CenterAlignedTopAppBar( + colors = TopAppBarDefaults.topAppBarColors( + containerColor = MaterialTheme.colorScheme.primaryContainer, + titleContentColor = MaterialTheme.colorScheme.primary, + ), + title = { + Text( + "Next-gen Kaldi: Speaker Diarization", + fontWeight = FontWeight.Bold, + ) + }, + ) + }, + content = { padding -> + Column(Modifier.padding(padding)) { + NavigationHost(navController = navController) + + } + }, + bottomBar = { + BottomNavigationBar(navController = navController) + } + ) +} + +@Composable +fun NavigationHost(navController: NavHostController) { + NavHost(navController = navController, startDestination = NavRoutes.Home.route) { + composable(NavRoutes.Home.route) { + HomeScreen() + } + + composable(NavRoutes.Help.route) { + HelpScreen() + } + } +} + +@Composable +fun BottomNavigationBar(navController: NavHostController) { + NavigationBar { + val backStackEntry by navController.currentBackStackEntryAsState() + val currentRoute = backStackEntry?.destination?.route + + NavBarItems.BarItems.forEach { navItem -> + NavigationBarItem(selected = currentRoute == navItem.route, + onClick = { + navController.navigate(navItem.route) { + popUpTo(navController.graph.findStartDestination().id) { + saveState = true + } + launchSingleTop = true + restoreState = true + } + }, + icon = { + Icon(imageVector = navItem.image, contentDescription = navItem.title) + }, label = { + Text(text = navItem.title) + }) + } 
+ } +} + +@Preview(showBackground = true) +@Composable +fun MainScreenPreview() { + SherpaOnnxSpeakerDiarizationTheme { + MainScreen() + } +} \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/NavBarItems.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/NavBarItems.kt new file mode 100644 index 000000000..65c737f97 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/NavBarItems.kt @@ -0,0 +1,20 @@ +package com.k2fsa.sherpa.onnx.speaker.diarization + +import androidx.compose.material.icons.Icons +import androidx.compose.material.icons.filled.Home +import androidx.compose.material.icons.filled.Info + +object NavBarItems { + val BarItems = listOf( + BarItem( + title = "Home", + image = Icons.Filled.Home, + route = "home", + ), + BarItem( + title = "Help", + image = Icons.Filled.Info, + route = "help", + ), + ) +} \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/NavRoutes.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/NavRoutes.kt new file mode 100644 index 000000000..2e1ae90b5 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/NavRoutes.kt @@ -0,0 +1,6 @@ +package com.k2fsa.sherpa.onnx.speaker.diarization + +sealed class NavRoutes(val route: String) { + object Home : NavRoutes("home") + object Help : NavRoutes("help") +} \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/OfflineSpeakerDiarization.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/OfflineSpeakerDiarization.kt new file mode 120000 index 000000000..459cc22cc --- 
/dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/OfflineSpeakerDiarization.kt @@ -0,0 +1 @@ +../../../../../../../../../../../../sherpa-onnx/kotlin-api/OfflineSpeakerDiarization.kt \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ReadWaveFile.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ReadWaveFile.kt new file mode 100644 index 000000000..940a2b643 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ReadWaveFile.kt @@ -0,0 +1,137 @@ +package com.k2fsa.sherpa.onnx.speaker.diarization.screens + +import android.content.Context +import android.media.AudioFormat +import android.media.MediaCodec +import android.media.MediaExtractor +import android.media.MediaFormat +import android.net.Uri + +data class WaveData( + val sampleRate: Int? = null, + val samples: FloatArray? = null, + val msg: String? 
= null +) + +// It supports only 16-bit encoded wave files +// +// References +// - https://gist.github.com/a-m-s/1991ab18fbcb0fcc2cf9 +// - https://github.com/taehwandev/MediaCodecExample/blob/master/app/src/main/java/tech/thdev/mediacodecexample/audio/AACAudioDecoderThread.kt +fun readUri(context: Context, uri: Uri): WaveData { + val extractor = MediaExtractor() + extractor.setDataSource(context, uri, null) + + val samplesList: MutableList = ArrayList() + + for (i in 0 until extractor.trackCount) { + val format = extractor.getTrackFormat(i) + val mime = format.getString(MediaFormat.KEY_MIME) + if (mime?.startsWith("audio/") == true) { + extractor.selectTrack(i) + + var encoding: Int = -1 + try { + encoding = format.getInteger(MediaFormat.KEY_PCM_ENCODING) + } catch (_: Exception) { + } + + if (encoding != AudioFormat.ENCODING_PCM_16BIT) { + return WaveData(msg = "We support only 16-bit encoded wave files") + } + + val sampleRate = format.getInteger(MediaFormat.KEY_SAMPLE_RATE) + val decoder = MediaCodec.createDecoderByType(mime) + decoder.configure(format, null, null, 0) + decoder.start() + + val inputBuffers = decoder.inputBuffers + var outputBuffers = decoder.outputBuffers + + val info = MediaCodec.BufferInfo() + var eof = false + + var outputBufferIndex = -1 + + while (true) { + if (!eof) { + val inputBufferIndex = decoder.dequeueInputBuffer(10000) + if (inputBufferIndex > 0) { + val size = extractor.readSampleData(inputBuffers[inputBufferIndex], 0) + if (size < 0) { + decoder.queueInputBuffer( + inputBufferIndex, + 0, + 0, + 0, + MediaCodec.BUFFER_FLAG_END_OF_STREAM + ) + eof = true + } else { + decoder.queueInputBuffer( + inputBufferIndex, + 0, + size, + extractor.sampleTime, + 0 + ) + extractor.advance() + } + } + } // if (!eof) + + if (outputBufferIndex >= 0) { + outputBuffers[outputBufferIndex].position(0) + } + + outputBufferIndex = decoder.dequeueOutputBuffer(info, 10000) + if (outputBufferIndex >= 0) { + if (info.flags != 0) { + decoder.stop() + 
decoder.release() + + var k = 0 + for (s in samplesList) { + k += s.size + } + if (k == 0) { + return WaveData(msg = "Failed to read selected file") + } + + val ans = FloatArray(k) + k = 0 + for (s in samplesList) { + s.copyInto(ans, k) + k += s.size + } + + return WaveData(sampleRate = sampleRate, samples = ans) + } + + val buffer = outputBuffers[outputBufferIndex] + val chunk = ByteArray(info.size) + buffer[chunk] + buffer.clear() + + val numSamples = info.size / 2 + + val samples = FloatArray(numSamples) + for (k in 0 until numSamples) { + // assume little endian + val s = chunk[2 * k] + (chunk[2 * k + 1] * 256.0f) + + samples[k] = s / 32768.0f + } + samplesList.add(samples) + + decoder.releaseOutputBuffer(outputBufferIndex, false) + } else if (outputBufferIndex == MediaCodec.INFO_OUTPUT_BUFFERS_CHANGED) { + outputBuffers = decoder.outputBuffers + } + } + } + } + + extractor.release() + return WaveData(msg = "not an audio file") +} \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/SpeakerDiarizationObject.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/SpeakerDiarizationObject.kt new file mode 100644 index 000000000..f4bc24554 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/SpeakerDiarizationObject.kt @@ -0,0 +1,66 @@ +package com.k2fsa.sherpa.onnx.speaker.diarization + +import android.content.res.AssetManager +import android.util.Log +import com.k2fsa.sherpa.onnx.FastClusteringConfig +import com.k2fsa.sherpa.onnx.OfflineSpeakerDiarization +import com.k2fsa.sherpa.onnx.OfflineSpeakerDiarizationConfig +import com.k2fsa.sherpa.onnx.OfflineSpeakerSegmentationModelConfig +import com.k2fsa.sherpa.onnx.OfflineSpeakerSegmentationPyannoteModelConfig +import com.k2fsa.sherpa.onnx.SpeakerEmbeddingExtractorConfig + +// Please download +// 
https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +// then unzip it, rename model.onnx to segmentation.onnx, and mv +// segmentation.onnx to the assets folder +val segmentationModel = "segmentation.onnx" + +// please download it from +// https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +// and move it to the assets folder +val embeddingModel = "3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx" + +// in the end, your assets folder should look like below +/* +(py38) fangjuns-MacBook-Pro:assets fangjun$ pwd +/Users/fangjun/open-source/sherpa-onnx/android/SherpaOnnxSpeakerDiarization/app/src/main/assets +(py38) fangjuns-MacBook-Pro:assets fangjun$ ls -lh +total 89048 +-rw-r--r-- 1 fangjun staff 38M Oct 12 20:28 3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +-rw-r--r-- 1 fangjun staff 5.7M Oct 12 20:28 segmentation.onnx + */ + +object SpeakerDiarizationObject { + var _sd: OfflineSpeakerDiarization? = null + val sd: OfflineSpeakerDiarization + get() { + return _sd!! + } + + fun initSpeakerDiarization(assetManager: AssetManager? 
= null) { + synchronized(this) { + if (_sd != null) { + return + } + Log.i(TAG, "Initializing sherpa-onnx speaker diarization") + + val config = OfflineSpeakerDiarizationConfig( + segmentation = OfflineSpeakerSegmentationModelConfig( + pyannote = OfflineSpeakerSegmentationPyannoteModelConfig( + segmentationModel + ), + debug = true, + ), + embedding = SpeakerEmbeddingExtractorConfig( + model = embeddingModel, + debug = true, + numThreads = 2, + ), + clustering = FastClusteringConfig(numClusters = -1, threshold = 0.5f), + minDurationOn = 0.2f, + minDurationOff = 0.5f, + ) + _sd = OfflineSpeakerDiarization(assetManager = assetManager, config = config) + } + } +} \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/SpeakerEmbeddingExtractorConfig.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/SpeakerEmbeddingExtractorConfig.kt new file mode 120000 index 000000000..9bab8fe88 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/SpeakerEmbeddingExtractorConfig.kt @@ -0,0 +1 @@ +../../../../../../../../../../../../sherpa-onnx/kotlin-api/SpeakerEmbeddingExtractorConfig.kt \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/screens/Help.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/screens/Help.kt new file mode 100644 index 000000000..b3640b9e9 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/screens/Help.kt @@ -0,0 +1,38 @@ +package com.k2fsa.sherpa.onnx.speaker.diarization.screens + +import androidx.compose.foundation.layout.Box +import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.Spacer +import 
androidx.compose.foundation.layout.fillMaxSize +import androidx.compose.foundation.layout.height +import androidx.compose.foundation.layout.padding +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.ui.Modifier +import androidx.compose.ui.unit.dp +import androidx.compose.ui.unit.sp + +@Composable +fun HelpScreen() { + Box(modifier = Modifier.fillMaxSize()) { + Column( + modifier = Modifier.padding(8.dp) + ) { + Text( + "This app accepts only 16kHz 16-bit 1-channel *.wav files. " + + "It has two arguments: Number of speakers and clustering threshold. " + + "If you know the actual number of speakers in the file, please set it. " + + "Otherwise, please set it to 0. In that case, you have to set the threshold. " + + "A larger threshold leads to fewer segmented speakers." + ) + Spacer(modifier = Modifier.height(5.dp)) + Text("The speaker segmentation model is from " + + "pyannote-audio (https://huggingface.co/pyannote/segmentation-3.0), "+ + "whereas the embedding extractor model is from 3D-Speaker (https://github.com/modelscope/3D-Speaker)") + Spacer(modifier = Modifier.height(5.dp)) + Text("Please see http://github.com/k2-fsa/sherpa-onnx ") + Spacer(modifier = Modifier.height(5.dp)) + Text("Everything is open-sourced!", fontSize = 20.sp) + } + } +} diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/screens/Home.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/screens/Home.kt new file mode 100644 index 000000000..a5a9cd31c --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/screens/Home.kt @@ -0,0 +1,210 @@ +package com.k2fsa.sherpa.onnx.speaker.diarization.screens + +import android.util.Log +import androidx.activity.compose.rememberLauncherForActivityResult +import androidx.activity.result.contract.ActivityResultContracts +import 
androidx.compose.foundation.layout.Arrangement +import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.Row +import androidx.compose.foundation.layout.Spacer +import androidx.compose.foundation.layout.fillMaxWidth +import androidx.compose.foundation.layout.padding +import androidx.compose.foundation.layout.size +import androidx.compose.foundation.rememberScrollState +import androidx.compose.foundation.verticalScroll +import androidx.compose.material3.Button +import androidx.compose.material3.OutlinedTextField +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.runtime.getValue +import androidx.compose.runtime.mutableStateOf +import androidx.compose.runtime.remember +import androidx.compose.runtime.setValue +import androidx.compose.ui.Alignment +import androidx.compose.ui.Modifier +import androidx.compose.ui.platform.LocalClipboardManager +import androidx.compose.ui.platform.LocalContext +import androidx.compose.ui.text.AnnotatedString +import androidx.compose.ui.unit.dp +import androidx.compose.ui.unit.sp +import androidx.documentfile.provider.DocumentFile +import com.k2fsa.sherpa.onnx.speaker.diarization.SpeakerDiarizationObject +import com.k2fsa.sherpa.onnx.speaker.diarization.TAG +import kotlin.concurrent.thread + + +private var samples: FloatArray? 
= null + +@Composable +fun HomeScreen() { + val context = LocalContext.current + + var sampleRate: Int + var filename by remember { mutableStateOf("") } + var status by remember { mutableStateOf("") } + var progress by remember { mutableStateOf("") } + val clipboardManager = LocalClipboardManager.current + var done by remember { mutableStateOf(false) } + var fileIsOk by remember { mutableStateOf(false) } + var started by remember { mutableStateOf(false) } + var numSpeakers by remember { mutableStateOf(0) } + var threshold by remember { mutableStateOf(0.5f) } + + + val callback = here@{ numProcessedChunks: Int, numTotalChunks: Int, arg: Long -> + Int + val percent = 100.0 * numProcessedChunks / numTotalChunks + progress = "%.2f%%".format(percent) + Log.i(TAG, progress) + return@here 0 + } + + val launcher = rememberLauncherForActivityResult(ActivityResultContracts.OpenDocument()) { + it?.let { + val documentFile = DocumentFile.fromSingleUri(context, it) + filename = documentFile?.name ?: "" + + progress = "" + done = false + fileIsOk = false + + if (filename.isNotEmpty()) { + val data = readUri(context, it) + Log.i(TAG, "sample rate: ${data.sampleRate}") + Log.i(TAG, "numSamples: ${data.samples?.size ?: 0}") + if (data.msg != null) { + Log.i(TAG, "failed to read $filename") + status = data.msg + } else if (data.sampleRate != SpeakerDiarizationObject.sd.sampleRate()) { + status = + "Expected sample rate: ${SpeakerDiarizationObject.sd.sampleRate()}. Given wave file with sample rate: ${data.sampleRate}" + } else { + samples = data.samples!! 
+ fileIsOk = true + } + } + } + } + + Column( + modifier = Modifier.padding(10.dp), + verticalArrangement = Arrangement.Top, + ) { + Row( + modifier = Modifier.fillMaxWidth(), + horizontalArrangement = Arrangement.SpaceEvenly, + verticalAlignment = Alignment.CenterVertically + ) { + + Button(onClick = { + launcher.launch(arrayOf("audio/*")) + }) { + Text("Select a .wav file") + } + + Button(enabled = fileIsOk && !started, + onClick = { + Log.i(TAG, "started") + Log.i(TAG, "num samples: ${samples?.size}") + started = true + progress = "" + + val config = SpeakerDiarizationObject.sd.config + config.clustering.numClusters = numSpeakers + config.clustering.threshold = threshold + + SpeakerDiarizationObject.sd.setConfig(config) + + thread(true) { + done = false + status = "Started! Please wait" + val segments = SpeakerDiarizationObject.sd.processWithCallback( + samples!!, + callback = callback, + ) + done = true + started = false + status = "" + for (s in segments) { + val start = "%.2f".format(s.start) + val end = "%.2f".format(s.end) + val speaker = "speaker_%02d".format(s.speaker) + status += "$start -- $end $speaker\n" + Log.i(TAG, "$start -- $end $speaker") + } + + Log.i(TAG, status) + } + }) { + Text("Start") + } + if (progress.isNotEmpty()) { + Text(progress, fontSize = 25.sp) + } + } + + Row( + modifier = Modifier.fillMaxWidth(), + horizontalArrangement = Arrangement.SpaceEvenly, + verticalAlignment = Alignment.CenterVertically + ) { + OutlinedTextField( + value = numSpeakers.toString(), + onValueChange = { + if (it.isEmpty() || it.isBlank()) { + numSpeakers = 0 + } else { + numSpeakers = it.toIntOrNull() ?: 0 + } + }, + label = { + Text("Number of Speakers") + }, + ) + } + + Row( + modifier = Modifier.fillMaxWidth(), + horizontalArrangement = Arrangement.SpaceEvenly, + verticalAlignment = Alignment.CenterVertically + ) { + OutlinedTextField( + value = threshold.toString(), + onValueChange = { + if (it.isEmpty() || it.isBlank()) { + threshold = 0.5f + } else { + 
threshold = it.toFloatOrNull() ?: 0.5f + } + }, + label = { + Text("Clustering threshold") + }, + ) + } + + if (filename.isNotEmpty()) { + Text(text = "Selected $filename") + Spacer(Modifier.size(20.dp)) + } + + if (done) { + Button(onClick = { + clipboardManager.setText(AnnotatedString(status)) + progress = "Copied!" + }) { + Text("Copy result") + } + Spacer(Modifier.size(20.dp)) + } + + if (status.isNotEmpty()) { + Text( + status, + modifier = Modifier.verticalScroll(rememberScrollState()), + ) + } + + + } +} \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ui/theme/Color.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ui/theme/Color.kt new file mode 100644 index 000000000..a96515d3d --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ui/theme/Color.kt @@ -0,0 +1,11 @@ +package com.k2fsa.sherpa.onnx.speaker.diarization.ui.theme + +import androidx.compose.ui.graphics.Color + +val Purple80 = Color(0xFFD0BCFF) +val PurpleGrey80 = Color(0xFFCCC2DC) +val Pink80 = Color(0xFFEFB8C8) + +val Purple40 = Color(0xFF6650a4) +val PurpleGrey40 = Color(0xFF625b71) +val Pink40 = Color(0xFF7D5260) \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ui/theme/Theme.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ui/theme/Theme.kt new file mode 100644 index 000000000..5dbbe7e59 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ui/theme/Theme.kt @@ -0,0 +1,58 @@ +package com.k2fsa.sherpa.onnx.speaker.diarization.ui.theme + +import android.app.Activity +import android.os.Build +import androidx.compose.foundation.isSystemInDarkTheme +import 
androidx.compose.material3.MaterialTheme +import androidx.compose.material3.darkColorScheme +import androidx.compose.material3.dynamicDarkColorScheme +import androidx.compose.material3.dynamicLightColorScheme +import androidx.compose.material3.lightColorScheme +import androidx.compose.runtime.Composable +import androidx.compose.ui.platform.LocalContext + +private val DarkColorScheme = darkColorScheme( + primary = Purple80, + secondary = PurpleGrey80, + tertiary = Pink80 +) + +private val LightColorScheme = lightColorScheme( + primary = Purple40, + secondary = PurpleGrey40, + tertiary = Pink40 + + /* Other default colors to override + background = Color(0xFFFFFBFE), + surface = Color(0xFFFFFBFE), + onPrimary = Color.White, + onSecondary = Color.White, + onTertiary = Color.White, + onBackground = Color(0xFF1C1B1F), + onSurface = Color(0xFF1C1B1F), + */ +) + +@Composable +fun SherpaOnnxSpeakerDiarizationTheme( + darkTheme: Boolean = isSystemInDarkTheme(), + // Dynamic color is available on Android 12+ + dynamicColor: Boolean = true, + content: @Composable () -> Unit +) { + val colorScheme = when { + dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> { + val context = LocalContext.current + if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context) + } + + darkTheme -> DarkColorScheme + else -> LightColorScheme + } + + MaterialTheme( + colorScheme = colorScheme, + typography = Typography, + content = content + ) +} \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ui/theme/Type.kt b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ui/theme/Type.kt new file mode 100644 index 000000000..39a81b941 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/java/com/k2fsa/sherpa/onnx/speaker/diarization/ui/theme/Type.kt @@ -0,0 +1,34 @@ +package 
com.k2fsa.sherpa.onnx.speaker.diarization.ui.theme + +import androidx.compose.material3.Typography +import androidx.compose.ui.text.TextStyle +import androidx.compose.ui.text.font.FontFamily +import androidx.compose.ui.text.font.FontWeight +import androidx.compose.ui.unit.sp + +// Set of Material typography styles to start with +val Typography = Typography( + bodyLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 16.sp, + lineHeight = 24.sp, + letterSpacing = 0.5.sp + ) + /* Other default text styles to override + titleLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 22.sp, + lineHeight = 28.sp, + letterSpacing = 0.sp + ), + labelSmall = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Medium, + fontSize = 11.sp, + lineHeight = 16.sp, + letterSpacing = 0.5.sp + ) + */ +) \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/arm64-v8a/.gitkeep b/android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/arm64-v8a/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/armeabi-v7a/.gitkeep b/android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/armeabi-v7a/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/x86/.gitkeep b/android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/x86/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/x86_64/.gitkeep b/android/SherpaOnnxSpeakerDiarization/app/src/main/jniLibs/x86_64/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/res/drawable-v24/ic_launcher_foreground.xml b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/drawable-v24/ic_launcher_foreground.xml new 
file mode 100644 index 000000000..2b068d114 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/drawable-v24/ic_launcher_foreground.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/res/drawable/ic_launcher_background.xml b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 000000000..07d5da9cb --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml new file mode 100644 index 000000000..6f3b755bf --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml new file mode 100644 index 000000000..6f3b755bf --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-hdpi/ic_launcher.webp b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-hdpi/ic_launcher.webp new file mode 100644 index 0000000000000000000000000000000000000000..c209e78ecd372343283f4157dcfd918ec5165bb3 GIT binary patch literal 1404 zcmV-?1%vuhNk&F=1pok7MM6+kP&il$0000G0000-002h-06|PpNX!5L00Dqw+t%{r zzW2vH!KF=w&cMnnN@{whkTw+#mAh0SV?YL=)3MimFYCWp#fpdtz~8$hD5VPuQgtcN zXl<@<#Cme5f5yr2h%@8TWh?)bSK`O 
z^Z@d={gn7J{iyxL_y_%J|L>ep{dUxUP8a{byupH&!UNR*OutO~0{*T4q5R6@ApLF! z5{w?Z150gC7#>(VHFJZ-^6O@PYp{t!jH(_Z*nzTK4 zkc{fLE4Q3|mA2`CWQ3{8;gxGizgM!zccbdQoOLZc8hThi-IhN90RFT|zlxh3Ty&VG z?Fe{#9RrRnxzsu|Lg2ddugg7k%>0JeD+{XZ7>Z~{=|M+sh1MF7~ zz>To~`~LVQe1nNoR-gEzkpe{Ak^7{{ZBk2i_<+`Bq<^GB!RYG+z)h;Y3+<{zlMUYd zrd*W4w&jZ0%kBuDZ1EW&KLpyR7r2=}fF2%0VwHM4pUs}ZI2egi#DRMYZPek*^H9YK zay4Iy3WXFG(F14xYsoDA|KXgGc5%2DhmQ1gFCkrgHBm!lXG8I5h*uf{rn48Z!_@ z4Bk6TJAB2CKYqPjiX&mWoW>OPFGd$wqroa($ne7EUK;#3VYkXaew%Kh^3OrMhtjYN?XEoY`tRPQsAkH-DSL^QqyN0>^ zmC>{#F14jz4GeW{pJoRpLFa_*GI{?T93^rX7SPQgT@LbLqpNA}<@2wH;q493)G=1Y z#-sCiRNX~qf3KgiFzB3I>4Z%AfS(3$`-aMIBU+6?gbgDb!)L~A)je+;fR0jWLL-Fu z4)P{c7{B4Hp91&%??2$v9iRSFnuckHUm}or9seH6 z>%NbT+5*@L5(I9j@06@(!{ZI?U0=pKn8uwIg&L{JV14+8s2hnvbRrU|hZCd}IJu7*;;ECgO%8_*W Kmw_-CKmY()leWbG literal 0 HcmV?d00001 diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp new file mode 100644 index 0000000000000000000000000000000000000000..b2dfe3d1ba5cf3ee31b3ecc1ced89044a1f3b7a9 GIT binary patch literal 2898 zcmV-Y3$650Nk&FW3jhFDMM6+kP&il$0000G0000-002h-06|PpNWB9900E$G+qN-D z+81ABX7q?;bwx%xBg?kcwr$(C-Tex-ZCkHUw(Y9#+`E5-zuONG5fgw~E2WDng@Bc@ z24xy+R1n%~6xI#u9vJ8zREI)sb<&Il(016}Z~V1n^PU3-_H17A*Bf^o)&{_uBv}Py zulRfeE8g(g6HFhk_?o_;0@tz?1I+l+Y#Q*;RVC?(ud`_cU-~n|AX-b`JHrOIqn(-t&rOg-o`#C zh0LPxmbOAEb;zHTu!R3LDh1QO zZTf-|lJNUxi-PpcbRjw3n~n-pG;$+dIF6eqM5+L();B2O2tQ~|p{PlpNcvDbd1l%c zLtXn%lu(3!aNK!V#+HNn_D3lp z2%l+hK-nsj|Bi9;V*WIcQRTt5j90A<=am+cc`J zTYIN|PsYAhJ|=&h*4wI4ebv-C=Be#u>}%m;a{IGmJDU`0snWS&$9zdrT(z8#{OZ_Y zxwJx!ZClUi%YJjD6Xz@OP8{ieyJB=tn?>zaI-4JN;rr`JQbb%y5h2O-?_V@7pG_+y z(lqAsqYr!NyVb0C^|uclHaeecG)Sz;WV?rtoqOdAAN{j%?Uo%owya(F&qps@Id|Of zo@~Y-(YmfB+chv^%*3g4k3R0WqvuYUIA+8^SGJ{2Bl$X&X&v02>+0$4?di(34{pt* zG=f#yMs@Y|b&=HyH3k4yP&goF2LJ#tBLJNNDo6lG06r}ghC-pC4Q*=x3;|+W04zte zAl>l4kzUBQFYF(E`KJy?ZXd1tnfbH+Z~SMmA21KokJNs#eqcXWKUIC>{TuoKe^vhF 
z);H)o`t9j~`$h1D`#bxe@E`oE`cM9w(@)5Bp8BNukIwM>wZHfd0S;5bcXA*5KT3bj zc&_~`&{z7u{Et!Z_k78H75gXf4g8<_ul!H$eVspPeU3j&&Au=2R*Zp#M9$9s;fqwgzfiX=E_?BwVcfx3tG9Q-+<5fw z%Hs64z)@Q*%s3_Xd5>S4dg$s>@rN^ixeVj*tqu3ZV)biDcFf&l?lGwsa zWj3rvK}?43c{IruV2L`hUU0t^MemAn3U~x3$4mFDxj=Byowu^Q+#wKRPrWywLjIAp z9*n}eQ9-gZmnd9Y0WHtwi2sn6n~?i#n9VN1B*074_VbZZ=WrpkMYr{RsI ztM_8X1)J*DZejxkjOTRJ&a*lrvMKBQURNP#K)a5wIitfu(CFYV4FT?LUB$jVwJSZz zNBFTWg->Yk0j&h3e*a5>B=-xM7dE`IuOQna!u$OoxLlE;WdrNlN)1 z7**de7-hZ!(%_ZllHBLg`Ir#|t>2$*xVOZ-ADZKTN?{(NUeLU9GbuG-+Axf*AZ-P1 z0ZZ*fx+ck4{XtFsbcc%GRStht@q!m*ImssGwuK+P@%gEK!f5dHymg<9nSCXsB6 zQ*{<`%^bxB($Z@5286^-A(tR;r+p7B%^%$N5h%lb*Vlz-?DL9x;!j<5>~kmXP$E}m zQV|7uv4SwFs0jUervsxVUm>&9Y3DBIzc1XW|CUZrUdb<&{@D5yuLe%Xniw^x&{A2s z0q1+owDSfc3Gs?ht;3jw49c#mmrViUfX-yvc_B*wY|Lo7; zGh!t2R#BHx{1wFXReX*~`NS-LpSX z#TV*miO^~B9PF%O0huw!1Zv>^d0G3$^8dsC6VI!$oKDKiXdJt{mGkyA`+Gwd4D-^1qtNTUK)`N*=NTG-6}=5k6suNfdLt*dt8D| z%H#$k)z#ZRcf|zDWB|pn<3+7Nz>?WW9WdkO5(a^m+D4WRJ9{wc>Y}IN)2Kbgn;_O? zGqdr&9~|$Y0tP=N(k7^Eu;iO*w+f%W`20BNo)=Xa@M_)+o$4LXJyiw{F?a633SC{B zl~9FH%?^Rm*LVz`lkULs)%idDX^O)SxQol(3jDRyBVR!7d`;ar+D7do)jQ}m`g$TevUD5@?*P8)voa?kEe@_hl{_h8j&5eB-5FrYW&*FHVt$ z$kRF9Nstj%KRzpjdd_9wO=4zO8ritN*NPk_9avYrsF(!4))tm{Ga#OY z(r{0buexOzu7+rw8E08Gxd`LTOID{*AC1m*6Nw@osfB%0oBF5sf<~wH1kL;sd zo)k6^VyRFU`)dt*iX^9&QtWbo6yE8XXH?`ztvpiOLgI3R+=MOBQ9=rMVgi<*CU%+d1PQQ0a1U=&b0vkF207%xU0ssI2 literal 0 HcmV?d00001 diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-mdpi/ic_launcher.webp b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-mdpi/ic_launcher.webp new file mode 100644 index 0000000000000000000000000000000000000000..4f0f1d64e58ba64d180ce43ee13bf9a17835fbca GIT binary patch literal 982 zcmV;{11bDcNk&G_0{{S5MM6+kP&il$0000G0000l001ul06|PpNU8t;00Dqo+t#w^ z^1csucXz7-Qrhzl9HuHB%l>&>1tG2^vb*E&k^T3$FG1eQZ51g$uv4V+kI`0<^1Z@N zk?Jjh$olyC%l>)Xq;7!>{iBj&BjJ`P&$fsCfpve_epJOBkTF?nu-B7D!hO=2ZR}

C%4 zc_9eOXvPbC4kzU8YowIA8cW~Uv|eB&yYwAObSwL2vY~UYI7NXPvf3b+c^?wcs~_t{ ze_m66-0)^{JdOMKPwjpQ@Sna!*?$wTZ~su*tNv7o!gXT!GRgivP}ec?5>l1!7<(rT zds|8x(qGc673zrvYIz;J23FG{9nHMnAuP}NpAED^laz3mAN1sy+NXK)!6v1FxQ;lh zOBLA>$~P3r4b*NcqR;y6pwyhZ3_PiDb|%n1gGjl3ZU}ujInlP{eks-#oA6>rh&g+!f`hv#_%JrgYPu z(U^&XLW^QX7F9Z*SRPpQl{B%x)_AMp^}_v~?j7 zapvHMKxSf*Mtyx8I}-<*UGn3)oHd(nn=)BZ`d$lDBwq_GL($_TPaS{UeevT(AJ`p0 z9%+hQb6z)U9qjbuXjg|dExCLjpS8$VKQ55VsIC%@{N5t{NsW)=hNGI`J=x97_kbz@ E0Of=7!TQj4N+cqN`nQhxvX7dAV-`K|Ub$-q+H-5I?Tx0g9jWxd@A|?POE8`3b8fO$T))xP* z(X?&brZw({`)WU&rdAs1iTa0x6F@PIxJ&&L|dpySV!ID|iUhjCcKz(@mE z!x@~W#3H<)4Ae(4eQJRk`Iz3<1)6^m)0b_4_TRZ+cz#eD3f8V;2r-1fE!F}W zEi0MEkTTx}8i1{`l_6vo0(Vuh0HD$I4SjZ=?^?k82R51bC)2D_{y8mi_?X^=U?2|F{Vr7s!k(AZC$O#ZMyavHhlQ7 zUR~QXuH~#o#>(b$u4?s~HLF*3IcF7023AlwAYudn0FV~|odGH^05AYPEfR)8p`i{n zwg3zPVp{+wOsxKc>)(pMupKF!Y2HoUqQ3|Yu|8lwR=?5zZuhG6J?H`bSNk_wPoM{u zSL{c@pY7+c2kck>`^q1^^gR0QB7Y?KUD{vz-uVX~;V-rW)PDcI)$_UjgVV?S?=oLR zf4}zz{#*R_{LkiJ#0RdQLNC^2Vp%JPEUvG9ra2BVZ92(p9h7Ka@!yf9(lj#}>+|u* z;^_?KWdzkM`6gqPo9;;r6&JEa)}R3X{(CWv?NvgLeOTq$cZXqf7|sPImi-7cS8DCN zGf;DVt3Am`>hH3{4-WzH43Ftx)SofNe^-#|0HdCo<+8Qs!}TZP{HH8~z5n`ExcHuT zDL1m&|DVpIy=xsLO>8k92HcmfSKhflQ0H~9=^-{#!I1g(;+44xw~=* zxvNz35vfsQE)@)Zsp*6_GjYD};Squ83<_?^SbALb{a`j<0Gn%6JY!zhp=Fg}Ga2|8 z52e1WU%^L1}15Ex0fF$e@eCT(()_P zvV?CA%#Sy08_U6VPt4EtmVQraWJX` zh=N|WQ>LgrvF~R&qOfB$!%D3cGv?;Xh_z$z7k&s4N)$WYf*k=|*jCEkO19{h_(%W4 zPuOqbCw`SeAX*R}UUsbVsgtuG?xs(#Ikx9`JZoQFz0n*7ZG@Fv@kZk`gzO$HoA9kN z8U5{-yY zvV{`&WKU2$mZeoBmiJrEdzUZAv1sRxpePdg1)F*X^Y)zp^Y*R;;z~vOv-z&)&G)JQ{m!C9cmziu1^nHA z`#`0c>@PnQ9CJKgC5NjJD8HM3|KC(g5nnCq$n0Gsu_DXk36@ql%npEye|?%RmG)

FJ$wK}0tWNB{uH;AM~i literal 0 HcmV?d00001 diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-xhdpi/ic_launcher.webp b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-xhdpi/ic_launcher.webp new file mode 100644 index 0000000000000000000000000000000000000000..948a3070fe34c611c42c0d3ad3013a0dce358be0 GIT binary patch literal 1900 zcmV-y2b1_xNk&Fw2LJ$9MM6+kP&il$0000G0001A003VA06|PpNH75a00DqwTbm-~ zullQTcXxO9ki!OCRx^i?oR|n!<8G0=kI^!JSjFi-LL*`V;ET0H2IXfU0*i>o6o6Gy zRq6Ap5(_{XLdXcL-MzlN`ugSdZY_`jXhcENAu)N_0?GhF))9R;E`!bo9p?g?SRgw_ zEXHhFG$0{qYOqhdX<(wE4N@es3VIo$%il%6xP9gjiBri+2pI6aY4 zJbgh-Ud|V%3O!IcHKQx1FQH(_*TK;1>FQWbt^$K1zNn^cczkBs=QHCYZ8b&l!UV{K z{L0$KCf_&KR^}&2Fe|L&?1I7~pBENnCtCuH3sjcx6$c zwqkNkru);ie``q+_QI;IYLD9OV0ZxkuyBz|5<$1BH|vtey$> z5oto4=l-R-Aaq`Dk0}o9N0VrkqW_#;!u{!bJLDq%0092{Ghe=F;(kn} z+sQ@1=UlX30+2nWjkL$B^b!H2^QYO@iFc0{(-~yXj2TWz?VG{v`Jg zg}WyYnwGgn>{HFaG7E~pt=)sOO}*yd(UU-D(E&x{xKEl6OcU?pl)K%#U$dn1mDF19 zSw@l8G!GNFB3c3VVK0?uyqN&utT-D5%NM4g-3@Sii9tSXKtwce~uF zS&Jn746EW^wV~8zdQ1XC28~kXu8+Yo9p!<8h&(Q({J*4DBglPdpe4M_mD8AguZFn~ ztiuO~{6Bx?SfO~_ZV(GIboeR9~hAym{{fV|VM=77MxDrbW6`ujX z<3HF(>Zr;#*uCvC*bpoSr~C$h?_%nXps@A)=l_;({Fo#6Y1+Zv`!T5HB+)#^-Ud_; zBwftPN=d8Vx)*O1Mj+0oO=mZ+NVH*ptNDC-&zZ7Hwho6UQ#l-yNvc0Cm+2$$6YUk2D2t#vdZX-u3>-Be1u9gtTBiMB^xwWQ_rgvGpZ6(C@e23c!^K=>ai-Rqu zhqT`ZQof;9Bu!AD(i^PCbYV%yha9zuoKMp`U^z;3!+&d@Hud&_iy!O-$b9ZLcSRh? 
z)R|826w}TU!J#X6P%@Zh=La$I6zXa#h!B;{qfug}O%z@K{EZECu6zl)7CiNi%xti0 zB{OKfAj83~iJvmpTU|&q1^?^cIMn2RQ?jeSB95l}{DrEPTW{_gmU_pqTc)h@4T>~& zluq3)GM=xa(#^VU5}@FNqpc$?#SbVsX!~RH*5p0p@w z;~v{QMX0^bFT1!cXGM8K9FP+=9~-d~#TK#ZE{4umGT=;dfvWi?rYj;^l_Zxywze`W z^Cr{55U@*BalS}K%Czii_80e0#0#Zkhlij4-~I@}`-JFJ7$5{>LnoJSs??J8kWVl6|8A}RCGAu9^rAsfCE=2}tHwl93t0C?#+jMpvr7O3`2=tr{Hg$=HlnjVG^ewm|Js0J*kfPa6*GhtB>`fN!m#9J(sU!?(OSfzY*zS(FJ<-Vb zfAIg+`U)YaXv#sY(c--|X zEB+TVyZ%Ie4L$gi#Fc++`h6%vzsS$pjz9aLt+ZL(g;n$Dzy5=m=_TV(3H8^C{r0xd zp#a%}ht55dOq?yhwYPrtp-m1xXp;4X;)NhxxUpgP%XTLmO zcjaFva^}dP3$&sfFTIR_jC=2pHh9kpI@2(6V*GQo7Ws)`j)hd+tr@P~gR*2gO@+1? zG<`_tB+LJuF|SZ9tIec;h%}}6WClT`L>HSW?E{Hp1h^+mlbf_$9zA>!ug>NALJsO{ mU%z=YwVD?}XMya)Bp;vlyE5&E_6!fzx9pwrdz474!~g(M6R?N? literal 0 HcmV?d00001 diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp new file mode 100644 index 0000000000000000000000000000000000000000..1b9a6956b3acdc11f40ce2bb3f6efbd845cc243f GIT binary patch literal 3918 zcmV-U53%r4Nk&FS4*&pHMM6+kP&il$0000G0001A003VA06|PpNSy@$00HoY|G(*G z+qV7x14$dSO^Re!iqt-AAIE9iwr$(CZQJL$blA4B`>;C3fBY6Q8_YSjb2%a=fc}4E zrSzssacq<^nmW|Rs93PJni30R<8w<(bK_$LO4L?!_OxLl$}K$MUEllnMK|rg=f3;y z*?;3j|Nh>)p0JQ3A~rf(MibH2r+)3cyV1qF&;8m{w-S*y+0mM){KTK^M5}ksc`qX3 zy>rf^b>~l>SSHds8(I@hz3&PD@LmEs4&prkT=BjsBCXTMhN$_)+kvnl0bLKW5rEsj z*d#KXGDB4P&>etx0X+`R19yC=LS)j!mgs5M0L~+o-T~Jl!p!AJxnGAhV%~rhYUL4hlWhgES3Kb5oA&X z{}?3OBSS-{!v$nCIGj->(-TAG)8LR{htr41^gxsT8yqt2@DEG6Yl`Uma3Nd4;YUoW zTbkYl3CMU5ypMF3EIkYmWL|*BknM`0+Kq6CpvO(y$#j94e+q{vI{Zp8cV_6RK!`&C zob$*5Q|$IZ09dW=L!V zw@#2wviu|<#3lgGE8GEhcx+zBt`} zOwP8j9X%^f7i_bth4PiJ$LYtFJSCN$3xwDN;8mr*B;CJwBP2G0TMq0uNt7S^DO_wE zepk!Wrn#Z#03j{`c*Rf~y3o7?J}w?tEELRUR2cgxB*Y{LzA#pxHgf}q?u5idu>077 zd^=p)`nA}6e`|@`p?u}YU66PP_MA}Zqqe!c{nK&z%Jwq1N4e_q<#4g^xaz=ao;u|6 zwpRcW2Lax=ZGbx=Q*HhlJ`Ns#Y*r0*%!T?P*TTiX;rb)$CGLz=rSUum$)3Qyv{BL2 
zO*=OI2|%(Yz~`pNEOnLp>+?T@glq-DujlIp?hdJeZ7ctP4_OKx|5@EOps3rr(pWzg zK4d3&oN-X2qN(d_MkfwB4I)_)!I_6nj2iA9u^pQ{;GckGLxBGrJUM2Wdda!k)Y>lq zmjws>dVQ*vW9lvEMkiN3wE-__6OWD0txS&Qn0n22cyj4Q*8(nG4!G{6OOwNvsrPIL zCl-$W9UwkEUVuLwyD%|inbOF*xMODZ4VMEVAq_zUxZ+K#Gdqf!DW$5f)?7UNOFMz! zrB~tuu=6X2FE(p^iqgxr+?ZK;=yz`e;C$#_@D9Lj-+TDVOrva>(#*PVbaHO>A)mhl z07OJWCqYC60518$!&c`eNBcBW%GnfaQ*$eazV^2_AW?j)h;J1nUjN(I9=0+!RVx~% z3@Tf!P0TE+98jA?WceK-}A1% zW!K)lyKcGqy#M~})315-A#2NXQ`?6NR#Apo=S!oF=JfpX>iR*49ec{7AN$xxpK{D$ z2d%Fz&rdfSqourN$~Y^NFIMV1CZ?J*bMx~H3k&meGtH@q9ra2vZxmA$S(#jaaj-g4 ztJmxG+DLV<*q<|sDXPp$X>E)#S}Vm&sRaO5P&goh2><}FEdZSXDqsL$06sAkh(e+v zAsBhKSRexgwg6tIy~GFJzaTxXD(}|+0eOwFDA%rn`X;MVwDHT9=4=g%OaJ9s%3b9>9EUTnnp0t;2Zpa{*>mk~hZqItE_!dQ zOtC>8`$l|mV43Jbudf0N6&&X;{=z}Zi}d1`2qmJ}i|0*GsulD3>GgQXHN)pkR6sf1 z?5ZU%&xtL}oH;YiAA)d*^Ndw2T$+Mjuzyzz@-SM`9df7LqTxLuIwC~S0092~+=qYv z@*ja;?Wt!T!{U?c*Z0YtGe)XbI&y-?B&G2$`JDM)(dIV9G`Sc#6?sI60de6kv+)Qb zUW~2|WjvJq3TA8`0+sWA3zRhY9a~ow)O~&StBkG2{*{TGiY~S8ep{V&Vo2l<6LWsu z^#p0-v*t2?3&aA1)ozu|%efSR=XnpX$lvTeRdKlvM!@|pM5p2w3u-6 zU>}t2xiYLS+{|%C65AzX+23Mtlq?BS&YdYcYsVjoiE&rT>;Necn6l^K)T^lmE`5u{ zm1i+-a-gc;Z&v-{;8r)z6NYfBUv+=_L}ef}qa9FX01)+Aaf+;xj(mL6|JUzGJR1|fnanb%?BPPIp>SCjP|8qE5qJ{=n5ZGw?81z3(k;pzH%1CtlX50{E7h)$h{qGKfzC`e2o`*IqA#tjA z`Fz&^%$b9F*N`)U-#6>a)Z`55`$Dd0cfcs0$d13^ONrdCu9xcv_=n#WQo8stcz3jP9|2EvdI-RhJM3%Q%oM&!OlShM|0 z?gz?wHZSnm45njLtsz8PVT1S&jAlbKg5kVam$p16=EK@Sj4EP0OtH zmJDmdc^v)x>56Qg_wmYHz6h)>kl_h$>0@J!ypv%APmjZTAQVLy6Fu50RGY&JAVNhx zrF_qG6`x9MkT;1SFWo$)l{M$;3qUDn9JwE}z zRl#E_bDRJFii61kPgBybIgp8dNW!Cc1b*^YYk-#oWLJvtM_v^hQx~9?8LD4VFFxBF z3MlrsSC%f9Oupn*ctPL0U1fwfX?`tRhPD{PSLFPQOmIt$mDy0SgpNVvHS+f#Do>h1Gn?LZU9(KaN>Q_=Y*_T zvtD7%_u^^+{g`0VGzg(VZrpVQ6Ub5M=tI_p7T93R8@3Zulu3|#{iNcu!oiHxZ4Rf*( zfmiN$$ru(*_Zqn=`Gq#OuHRTSwp7uH_SokR&|)RuW5yo=Z|_4?qU-JU+tpt>!B&Is z@N(=SG;bpVc;AO@zbmMM zScqq1)b-ZQIrs={oD}|?6y{$HNB1U0^LsBh8JI&3!GBZxOXI<}&5-$lgkAaYqhOTb z?2vEnZ$-kk;*M_17(upJF3%+iH*s0-r{vttXVB2OUwI1s^+G(Ft(U8gYFXC}#P&E^ 
z>T@C^tS`Z7{6HT4_nF~n>JlZtk5&qDBl6r|^kzQYe`wq!C)n@$c>WOPA61NDFj<<6 zGW71NMMhwAl!U-yqrq2xrSFqRCI8acw7?}3j;ynxo*-b7Co;g5r%^j=H@9({PXXBf z@r>U>>N;E)81wx`B4f%{PB~MHka_);%kBCb(d|Jy5!MqJ%2p`t&@L)4$T2j&-WHvG zv3(uyA_gwqNu(k?jQTtv3dgPKRZoH8prxe7>pQBW5L&dpumS&5Ld2?(sCpJjvc4L5 zEnh&?91WVm)ZdTj=fjJ$pPDdgAttLXuke+?KdKxu*;kTC(r!tQk6;gxj4h%FdHAt(^M3YvYj(!tOeN)+Hvj6+< zzyJRG?^lZfWuR#t!tUKP&(?%3v&Zd$R2YN>lB(Lq`OInY48%4%yTv2 zYe1{G`3)(PDEio5Y@-I5tUf`c%%OCJMtSW56g3iEg%3`$7XSJJHyA z<|7&N)5Xrlgv~%BO24eFd;Hd;uiK%D`EdK|quUeRZDqbh9l)%j%J#0lfrZumvA<_w zu&=AVvdChf6}eqh(bUz`(`Ue*p01{fBAcTgKyDYLs_I+YyJEk+rM@avU~>fB$n)HS zM7pfJydu`i%gfS<{PF94kZDv$t>06sAkheDzu40NJ$5CMW%n^Lls?8^p^QGWURbKu3ZduZQZ((s2? zzE`}<{;Zt7<$C|9R8A~DJ~@%x>TfP zF>TX8)@v|t)q4GjRt<}5s6hLHwRel7>V@&r-O|Av(yh;Q1A{E>Ir>p+%dHD|=l+lT zpr(Dg&>#Nu=!)6bCLr-ZS%|;h)Ij$+e@r8_{qO19QvDe=&1tmpY*0lcA^Cc-#{9fQ z<~$*<&P$Q<_jy#<$40PMofM7aQ}C=jphI`4kLg}Z7CIN#26D{-4v-_CA-LiE@(%{y!BzsU%gG`Q?sjLUf%qFSl0y)2#ae*+EI>s|i`d^V$Dn)qmzqRq6VJRY|{4ujsIU%#bnqU6MR&-1I_43=|5(6Jr;Jvert) zE?S|Tmn}Tv<-??sxV5@9t}3D=>YZ0JrQe$CO~|EY=Lj9RM&4svQHPQL6%pV5fPFiH zfXDx;l@~et{*{U*#c#Dvzu)|znDO7$#CRx)Z&yp-}SrD{&|(MQtfUz~n35@RLfUy=aqrhCX0M}J_r5QsK~NmRCR|Nm&L z41UdsLjWxSUlL41r^0K&nCCK>fdR-!MYjFg(z9_mF^C|#ZQw?`)f6uVzF^`bRnVY& zo}@M06J&_+>w9@jpaO4snmU;0t-(zYW1qVBHtuD!d?%?AtN7Plp><-1Y8Rqb20ZaP zTCgn*-Sri4Q8Xn>=gNaWQ57%!D35UkA@ksOlPB*Dvw}t02ENAqw|kFhn%ZyyW%+t{ zNdM!uqEM^;2}f+tECHbwLmH*!nZVrb$-az%t50Y2pg(HqhvY-^-lb}>^6l{$jOI6} zo_kBzj%8aX|6H5M0Y<)7pzz_wLkIpRm!;PzY)9+24wk2&TT{w--phDGDCOz{cN_ca zpnm7`$oDy=HX%0i-`769*0M6(e5j-?(?24%)<)&46y0e&6@HCDZAm9W6Ib#Y#BF6- z=30crHGg+RRTe%VBC>T00OV6F+gQDAK38Ne3N9bm|62tPccBJi)5{B z4zc^Db72XiBd}v$CF|yU{Z=M|DZ%-(XarYNclODlb1Kz1_EKLy(NSLCN`eUl(rBCL zT*jx@wNvze0|TSqgE(QArOZU)_?qH(sj#TwzElLs9q)(0u!_P|R%Cy_0JFQxgGV>1 zz4?_uq<8_gM0`c*Hh|;UMz~vrg1gQXp{ufg`hM_qU;U>+zmvc5blCLSq@PrEBSGR# z&8=2Z4uXN`F3p73ueD1l{s{k$WipAvSh5W7ABe?4)t;r@V?y`bNB5FvBuE|0VRTb< zM1Hn^?DSsJY+sX@T5xW=#>T9VEV|?<(=6|ge$X6Sb05!LFdjDcoq*gM(Zq=t;_)Le&jyt(&9jzR73noru`a# 
zN*<`KwGa^gZU3-)MSLF0aFag#f0<>E(bYTeHmtdbns#|I)-$)mJ`q9ctQ8g0=ET?| zdO}eZ*b_p>ygRTtR^5Ggdam=Zb5wmd{}np+Jn1d_=M`~P=M67jj})fH4ztb5yQqQW z^C|C&^LHAK-u+ooIK)yM)QM?t;|<{P;;{`p=BclzAN#JzL4jCwXkQB1Dy{=^KR`=~ zTrr)y7eiYBzSNs_DvO=4A6#EgGS-zY%Vi)N*Yb`U;6o}KR}dq{r9pT5wqZ@3NOE8- z9-(}D|Nc5732CSYQbL)!gPQ#RbD8BhK3dl{sUuPvei0tkvnJBxDEAYTesU8H$)g(Plra{VH(v3u^CO1~(+ zU0O7#)jaS4{NcwA+LuSm&VBcX2#Im3xg)W}ySNw%->orn1taZ&+d)}8gJTqA!u|5P z{yv?zol_3|(1(%M(EVU=cp?L`{Pi|ixk{U)*guFML3P!OSlz;zGA#T+E@8@cgQ_mv1o7RSU=Zo_82F?&&2r;WE z@wk}JHYEZ9nYUc(Vv~iTCa3u8e4q(yq<29VoNbKk|`mq%I6u)My=gPIDuUb&lzf4`MEA9^g8u z)vp8|$$HE9m_BTV?lOosIGa4jud=jIbw)O2eCMfyw2*S8?hjWw^nqws$O*M$3I1)x zR0PWFb3$ySOcGTe1dz%N0l;RPc`x%05FtT^f^j{YCP}*Q=lvp4$ZXrTZQHhO+w%wJn3c8j%+5C3UAFD&%8dBl_qi9D5g8fry}6Ev z2_Q~)5^N$!IU`BPh1O|=BxQ#*C5*}`lluC515$lxc-vNC)IgW=K|=z7o%cWFpndn= zX}f{`!VK02_kU+Q5a3m37J;c} zTzbxteE{GNf?yLt5X=Bzc-mio^Up0nunMCgp*ZJ;%MJvPM3QK)BryP(_v@ei4UvHr z6+sbCifQaOkL6-;5fL8$W($zZ_;CZp305C;~$hhRquZr-r)jjd1z z31%ZK{-(`P#|Um_Sivn@p$-vz46uqT>QG0B1w9znfS9A8PB2LaHdzA|_)yjXVR*l{ zkcu3@vEf7bxH0nkh`q?8FmoO_Ucui*>_a~P?qQrlZ9@+D7%MTpSnztpylXrt5!-k8_QPB?YL8Kx_On8WD zgT+111d(Op$^$&KLAN5+@?>f7F4~wFi(8TL8+szgVmcMDTp5l&k6~=rA{Dt}!gb^r zSWY<)M7D|Z2P0cEodj6E42PV>&>DFmQpgt)E-|#sSUU@uKed+F680H@<;-x{p|nuH4!_mn85rx>wz;0mPi2ZkL#k6;sznu?cXh!T0S>{w6 zL^gvR05NY64l*<+_L>On$rjx9!US;l;LX6@z}yi#2XHh)F@Oo+l)h%fq$v}DNmF2> zfs^_t0)3N-W<9-N?uedVv{)-J0W5mh#29QM5R5h&KuiRM=0Zvnf#lF=K#WlCgc#9c zS;qvh(P$!_a8JwyhI^ZJV2k+B6Z^64?w|1?5gyo6y{}923CRZfYVe1#?F% z7h2SUiNO3;T#JUOyovSs@@C1GtwipycA=*x5{BpIZ_#GCMuV8XK=x;qCNy{d7?wA~ zC+=vjls;ci&zW=6$H~4^K%v{p}Ab?U%C6Z4p%eC<3ExqU$XR<}LLF67A$Sr20DR_pJ3yeBa~ z^sw{V0FI5;UpwXsScYuhbqGQ`YQ25;6p6W^+tgL&;Ml;>S3CGpSZ>VrTn0m1$y$HU z&65)I!c?oREz};c=nLCliriqQX->4uivHTgd${GqeAlf*!P^B|jkU|*IdNP(&6C>4 zqOW$)Nw9nvjy^&`?E|gotDV{JmJ9Q~vuhy<`^C4XIUDt|j4o6rK^e8_(=YqC zuaR6TRVf@tUFHB079o4MBIh{M~4>WwnGgesQH*3?w(RA%hCZ*7)b!aNV=yOQ%o_Y=Lt0Sl*(9^jfRnC210Om$=y>*o|3z} zAR&vAdrB#mWoaB0fJSw9xw|Am$fzK>rx-~R#7IFSAwdu_EI|SRfB*yl0w8oX09H^q 
zAjl2?0I)v*odGJ40FVGaF&2qJq9Gv`>V>2r0|c`GX8h>CX8eHcOy>S0@<;M3<_6UM z7yCEpug5NZL!H_0>Hg_HasQGxR`rY&Z{geOy?N92Z z{lER^um|$*?*G63*njwc(R?NT)Bei*3jVzR>FWUDb^gKhtL4A=kE_1p-%Fo2`!8M} z(0AjuCiS;G{?*^1tB-uY%=)SRx&D)pK4u@>f6@KPe3}2j_har$>HqzH;UCR^ssFD0 z7h+VLO4o@_Yt>>AeaZKUxqyvxWCAjKB>qjQ30UA)#w z&=RmdwlT`7a8J8Yae=7*c8XL|{@%wA8uvCqfsNX^?UZsS>wX}QD{K}ad4y~iO*p%4 z_cS{u7Ek%?WV6em2(U9#d8(&JDirb^u~7wK4+xP$iiI6IlD|a&S)6o=kG;59N|>K1 zn(0mUqbG3YIY7dQd+*4~)`!S9m7H6HP6YcKHhBc#b%1L}VIisp%;TckEkcu0>lo@u995$<*Em;XNodjTiCdC%R+TX|_ZR#|1`RR|`^@Teh zl#w@8fI1FTx2Dy+{blUT{`^kY*V-AZUd?ZZqCS4gW(kY5?retkLbF=>p=59Nl|=sf zo1Pc|{{N4>5nt#627ylGF`3n>X%`w%bw-Y~zWM_{Si$dc82|=YhISal{N7OY?O`C4 zD|qb}6nLWJ`hUyL+E>-;ricg9J@ZNYP(x(Sct&OI$Y!QWr*=^VN;G3#i>^1n4e#Je zOVhbFbLpXVu*16enDM+ic;97@R~u&kh__kgP#!R`*rQEnA+_dLkNP~L`0alC|J;c; zeiK=s8;BsLE)KbG3BD&Br@(Ha@SBT&$?xX`=$;eeel=|R_dIr6-Ro?=HEjnsJ_b`1 zK6Yg^-6;^2aW!xeTK)A~3Rm|L^FCHB_I>jIju7ZGo&N_1*QHkxH2!!%@o4iZ?vntS;&zJdPe1dH#04YD93A44o-MpfD zP{rn_aq>U%RDvC2+bp;xPlsOzauIi3*Lf42`jVKKZCRuKdYhi>FDuL2l=v{$BCN#Q6796s%r-AG$Q^t(3c@ zD?w0UhYr11@feiyl9kY_@H8~|xlmO<8PfQmj1!$@WieW@VxR@Psxfe-v9WCi1+f>F4VL?0O~K7T?m4-u|pSkBpUJZZe*16_wAp zSYZ@;k`3;W3UHKUWc8QeI}0jH5Ly=cGWQPw(Kr2fm=-5L(d`lcXofy8tJY3@Tuadz zYWXR{mW7XT!RF#RVCe%}=tM*O6!AD3^(!8un~opNI%Uko7$5t@<8+?; zTxDys(MyyGsUjtSu9$+|_-t!U3fVb1dkK?l`17<+jfl=hrBHnDSV>^R1=TnQeyqbW z>ov#l%!1|S!1>8UUxIdhQq`_klcHVx0{?#>K3#$4GlXncwldt!g17TcvKq-jo_996 z>oA=tH9CqRl6Yw?Uc`am!V?lHJbizOJaVaScf1UP5e7Dbgabq=b!B~T&_F6?ooU>w%x0A zH~&MHJ=q`fCH{U<7MDXE4SD32cDZA)WJeWkllJ`UspWaS#eDe^kg^oU_A14UE9zG-a^g{xaXf$})Wik>gT zl#dkzGr(;h0JZDuFn(+k8wNq?PZ5grQ<+sM?wBGt@JnH6v0#or-5wBQWKU~(S_> zkE!tc*ZJ1Y&*p(xX84POb3cClRMd!^qJ#CAZfIepEj-<`VURS_yCz0(?*Ixcj4 z-!zV1_QZhpm=0<;*(nm+F>T=)o?ep@CK5I%g^VAA+RB25ab?7)A~z~egru=I1S|@v zH7tXV!0wmGS^qj#e+MY;C5eUjEAp$Y?LDkS^QPZ}8WN85?r$u<-Epi;yZ1|J2J`se z$D6DpH~2F=eI0B&=UFAUnJvZAmClJlK)sutJ?M>xpZiWV&0=G4MZP+x+p>EX=HbCz zxls%Mw?*u^;LbHWIWCyq+yi)`GmFn9J112CZda_u@YIP%i;srFg_paU02Ifij*7}l z&CF-(3|>*a|+vbNR`^RP=9G?ymEJ0Z~)d&c*UE$UMepZ 
zcITr{0WqhxkjUnM15js_gW=e3Uh|y6ZReaXHIz-=p`x5VvB&rH9y>Amv@^WmXFEw) zQXYrk3feir=a{jMQ+wDIkkFnZ$k{sJakHn*?u za%4b!00ev8NVLM1TY=cl?KB&55BY_MU-sg?c>=Dbz_W{(Z~c?HJi*XpYL)C6Bd8WH zt+v-#0&o~@t4qESi*)+eW%@VD0|o^yF)n0hME$UtXF$*Lvh}7sso{`|pn*JDIy5^Fm3s$5*zEE=?u5<=l8FJc3r%+H} zdfoNl2J0^~!-*mOL5o-x32|e0Im*E!yY7F7E5N)W3>+v_LBydlEx?4$RL5f2oYRD# zaR0wv(-p~wO0eLDl3K=%`{5+0Gd$ktO=W)gWlGZJ0`K z$_RNA=ckrfa;H0KA~dR^p�(p-{x$&=IACIfoAR!za)F-^da-t3#0Dycnp zwO~NVXwXCl;jE<}>%@xz|=8fIJAB?>+E{7)|4l${4ngA3G|=r z2Dyv;VVWSgZx9Wj>qUjleGl3Ei9K4>h!(lPS%8VOG>Xu0%6VDz^O=bjJmuP7>DeUv zrbI}MlHB^^d?{zv6d=@_ZD2lg1&G7UjnVN{1}9WkaM3H~btX0GtSzB+tZ^qRgWo4m z!GmimlG$=wgXCnr6j@m<1gAL46#T~5Bnm=2{^@>|t&`9mkEPddj zAvG~@Tv~TAm2i%VW}R-g(Z0)z-Y|szHr@rk>4MAyG*Ma*7Yh#H7(!-5>DZ@8r;_dx z{prSe<>~099F8vsYd2xff7uAS%7{S)f(|@me3t2$iy&NEc7OUEchp@9A|X;;IA>8!oX+y(BKJ$EzV* znR$z;!L$s7uy@{OT~nG#B!NRraT8(X##Ho!0r_o@gg0CA-9H^;-uE&?$2$nHv_00o z%cbuUc-tCx$Uh&EZ4Nf4Zgqv)Y6>usG3>GeQnxx_Z6+PcbX-+ysbt1hQ`K1LDpOE? zrAhIZhSN9yVIAOa22gn577tbc&i3|3V8NWy&!tw##`}9*x}gtI^h1DzZRA>UuaJG) zaZ7j)dq!O}{?#8Y7~7i6fHh4{`pL?>-18|p!S75Y#^DM>-S3)vuZG+Q7l@ek zQP~#cBpWgg#mApc_sPYjpw8odQuRokmTkzcNl`^CcKB7e&;zViV;{Y{o^Y$%7i0m# z62%#1Lq!RC?}lK>%mp}T!3Xv;L*0v*>USLm``N%>w>@fwC+#T&Tx2bN4w(20JB}oU zuSa6v^kXi0xPs?pbaOHnyiqq6By1EZY9OZ^^QA>{q-Hsd&m`pbQ%8121aWG-F5xf zlZ%;B{;C>X19|`^_?dVyCq>n+41w7|!tUS!{9rHlbhX=SZO5CQ^;!Du_E7*`GiR^Q w)2!4MKjfSAeNo!9>IaV6aUZ*?W>} zs4%E?srLW`CJh0GCIK@hTkrW7A15Iu%N&?Q^$0+!{Tv&|t^Y@u%!L zglTg&?Q5q#ijZ;&HBQ?FNPp;k3J5!&{^+SGq?AX~SiOM9jJMRpyP?RCr@z38AQyy&WRMaC;n4una$~nJKSp?q|s8F00c9?Q! 
zY_ovvjTFm+DeQM^LXJ#v0}6HRt3R1%5PT*}W!k8BEM;Jrj8dIceFo2fhzTqaB3KKk zGlCLI)gU25(#u6ch6GeB1k@eHq7l{EHXv0n6xE#ws#ri}08kkCf8hUt{|Ejb`2YW* zvg}0nSSX1m=76s?sZhRY$K=3dpJ+y*eDULGnL2}4>4nvW^7_<~wIM_5fjvwt4h1|g z)g0Z6ZFq9j<~9~b8((~TN{Z?ZQfw|is&Xp~AC61sj;xItKyCHdI|tCMC_LbXF>~vR z=w6V3^H=W4CbAgR4#xw}ETTwu2guW~=Crl@SMXv85jQ=%y!s^?m4PI0My7MWICO;- z175jm%&PcPWh8QdOU(#8bp4!N7ET-+)N}N2zk2)8ch|4Q&lPFNQgT-thu053`r*h3 z_8dI@G;`zn;lH$zX3RzIk`E8~`J=BBdR}qD%n@vVG1834)!pS1Y?zVkJGtsa(sB~y zNfMYKsOJb%5J(0ivK8d+l2D2y&5X!cg3BG!AJ}910|_${nF}sC1QF^nLIhzXk-Y#x z0)&1iK!O;Og0Ky!;`b~v%b$`S4E&fB)1NB4v@8wr( z&+NX4e^&o)ecb=)dd~C!{(1e6t?&9j{l8%U*k4)?`(L3;Qjw z#w7FS+U(94MaJKS!J9O8^$)36_J8;thW#2$y9i{bB{?M{QS_inZIJ!jwqAbfXYVd$ zQ5fC$6Nc9hFi8m^;oI-%C#BS|c8vy+@{jx6hFcf^_;2VRgkoN(0h!_VSGmgNPRsxI z8$rTo0LaYq-H5i&gtj81=&xU?H-Y2==G@uQV7E`@+2E9XQW@{&j`?EOktk|Ho{HU>ZqDzvgjwBmdex z&uZNd2C1h{{}2k6Ys9$*nFP3;K%u!MhW`uZy7Sn`1M1zs@Es&;z*Z>Gsh@-3Fe6pE zQD2@cqF((NrRevgvLsvM_8;;iNyJ5nyPyy?e!kvKjGj`6diRFBEe49Oa7wwkJFV7Z z$YT&DWloYu-H?3<0BKn9L&JYDT-SK~*6c5pi18P26$JESKRYj{T7Zk6KiRJcbvOO*{P56Q6s8msbeI3>|j>K9}Q9UBeq*inXKemCm`-<5|-$ZyN4u$(3 z&HcvqehFD%5Yrmykg-^d`=BSa8(i=>ZoC77^mWY{evp(km@aHqhUECBz76YiR+VYK zY_avFC~V3$=`6C4JhfHAQ@DZtUOwH`L;oYX6zK0-uI^?hS$ALfq}A7evR;ohJHij} zHSZdW?EKv9U1s4oD*<(0oQ*;MaQ6@cvGL zuHCPgm_NhVsgp^sfr*ia^Db}swo1?O(_Q2)y+S$CBm+g=9wCOUPbz(x)_GbaKa@A7 zuI&!ynLiZRT#V%_y_-D`0Z5lT*auoe{(U5NylTzFSJW()W-#F6*&A`LNO1bV#Y;QJ zSbLBnp|B^dtK|KIWC|No>JjWBWE@n7O)x{&^E(WMeMvp57#qA8m* zeTow*U@_86B#Fm*rxyYu5PRWaWHx8y> z*qmHEp(AMDl0v)ij(AY8fnH=~ZwwjVAbu*m5;xPfidh@ov6d8g zfJsi&!QyK53Es%sC39ts;54V68koALD4b|%tNHW0bIkZAJKa=W&FomJSEDT>W1xIX z1x%Z>AvNIsSPLcn3RTcHXb@KB?cuM)=x6fcIx>&(GxqZ8w3p#jJ(GVgc*`c0HG}dv zIop&Qim!K1NFwic%07KcjWgHBPUkq7f~lj;TPqVGTiT#cUeim>;nY`>h@a*S{qQex zQ`z62WK|Mj)Y{tfF{;T4P;c8$Q|KU?Joh zIkA^z%X7z|r>4aTh@|StTi!-r1D!g=zb#3d#{{&K3CqE$Iz-UH<%37c zRfkO`&uM%#AD3PHv`g5t0e^O%nVL0d{Xlx^EjEC3#skF@`zl-7PF^0oxW)1!C!JxR zWvuAHH?)61FKA1QeT*_sY7;_Id#!GmV4n`MO{~sv}VLSK` 
zXRw=Y=Clz*00B(5y^K;gCZMAzjT5+c3IC=)l(9VIDdatpxj3y89WwI|bH&$!ZEvp` zPR!T@#!(|KfI-w?!&+7$N3F6>tD{YO4Qg$d_`nNEdfVCha9vaPn0jI0`)`@*72hq! zpU5ND^P*RoEkbD5o#az(-g=Y)L>HH>Oc%}$ zT3Rs_ih0;4+Lv4Y;@Iv(;fUbQ=i-G(#>vghec~*j(I#r|5mqFiJBpzi&hzEcD{u$< zRsm0BVYn=pT;0>R(itW|*D&;O%bOc7et9ACaH#J>z3A1A~6fdP>pmbM%xzm4>|;c_?B+%sl;Qs2{t!60$^u zH1t@9^6>;?!FuusnISi$f5CL&;z?EqJN$FBuWDA#D5`cy_UvCFIVvf{c?4N0teh;d zET$7aVbj08KTQS!x?Nd1Is8q8qFzs}a=!@nJ;7FSfCY^T@D-gpw`w<6e#X3+;O}1h z$%I!M)0bg|EKUA04Qjn@+x{Rj8vt6Wn!R|3A92z}^$KfF5(#CWr4y#~re1CN4i4w0 z#GsypBR{xA3Er7sgAi(|}1-W?s~n$7?K|9WL8kpVfw-;#b9 z+mn;=ep!162U5R>_t}fOt~tE?s#m( zO-S$7>Ay6*hHdZ)7_oU915WYYCIX;hFI-U2EWYX!pllONr@Q--2o~`!isi6vTPLJ4@(|o=%NHYjo0_S&q*UQIROw@*N-By@PaQ&;YxFZ0aR zX&}LeOEz);#m~Hwm^VAY8DK}b$F4bo{jMN?d!lxKPhNklzr^Cd`0f4oJr^z=I|l`* zm8AHm*fPV`0=lF3Pnnp}&J0N1X@}-D94YvmUabFrLGSnTz7Mu^21F#O5tN#CuY9Vh zUZBH=ez%h*wkf0hBtXJh1SN3d+IF{gzT7lp)j}n?03lt;XSQRAh7qd&v;RwTYDuQ# zbI2*r<>?x-G0@hM{;%{VBD7nLKt~D`T~-HAt5;h%i0_=Ifs=yHma5dhJ+QMG?Ux(a z|E?1CMy1!~oA`FP!k~iG=t&5#>bVdz=peT8HMB6Y)#7PpETtNryT^+Rv3vpJaF^zP z{H}0-LyV9Fu21ID%wO9f1IKlFr1p4c{o-?03vyB-tr5duk^&L$;m_|f$vs`^Sl{j2 z95}oY{LlY+=ZS%J+tZoXCd0*sSU7w^gjovXn+g7uyra5{cU49@yHf#Z^Jl-$9cIfo z+AJuxH$VLb=#+uBbVmUjnx zxb1pZ@-O9=AIk4@S)m6fJ2?{HrNYwwnL3a45muuNjr;6$O`bGEM0T4A2_S$t=86*- zcO+0mywg*j#A4mU}enR_!cGmIYQ;qwfchWtFEXL)AK%*;=j znYne+hS4EMy3S)C*mZ1KI>!+)0V@9!N6H$Y}~MJ{rYuf zz^KljIWvFi-?#?V@LPR&c6Nn{!=XM z>}-h$S76;$H{E{Y%@^zlmOl^efBwa%UU+jJD9UVukQ3ti_kH-?H*RC0?M1W%FCvMB zM_+v6fk$6X2sx)-p~B3&Kl{nscK}pNLM*qjtpaf9>AU{-iPKQZR8yCg!TY}Qg*(;) z)gdvCcB%kppZc$VdvsK@)3l1{&DG!d_6OHOS`y=ITLEVu`unSKA2E%JD*DVX{LJ}K z9l>hMRDqxQh0lnpGHpVYneX}eA3Pt|2v%=q;rt)``R|#bDyB)OXY&vI_@|*}h}G?^ z@aZ4_!7cQPX`!fW_?{oT1NTwHs#l5L-0`E|y@48<3Q^HFf8=Idi zpJYD%1MkII!~|7I^WGo)IF=?{>ACnjJ_WUi39C}!Q{QnheVJqeKKqq5^o5CBde(g9 zvw$X6^jz_^E2$wSw4!q5*RG(C2_^XO$HBn_55vbl44OnTTRwRaePP0vo{K)U1#99& z<>rq7V&V(<&@I%MFoN5zrY}sz=(*-L&}1QQ*a%`u25h{cFj===17eB_uGuzG&byQ< zrm8BJZl4r_E$3k|Wo6FW0-6M7>qac5uFQsQcmkLWGfeH74S3Z_rJ!jgN++!@i=HW8 
zkyjI(oPH-+-N#Qc^-mpNO`bc6r=2-<%&Wy5K1vfFJB(L_IkpS6fY^NmuL8qsgj>MD zn~BHH9WM~32_3vd=W&B)k7F9q%stJx+b_L_X-4zr^LVUMCmyCTA3sWtkvsmME?Xiy z?xOSfB=_$oY06~J-HcCq&)qcW{j;uP;?Dm}=hkq?zh&n!;m((-G-u_t|6x399Q;>A zgNpxoJNj{u|MFDH7Rhq@FCAl0dE|ddnl!oh9{Lq?@JDoR6L;C941IK`ISfdE$4S zE0AUQ8+2|Ncl_q5QkSp#AODp~(^mfP&%Au@@|TBQwoP`UU+V{6u8|)6ZA{~uKmQ*M zmrMTDU8S~8Eqi{^v0Ug&5Upcm#y7Z1(RbgZAG8jB$eRwCspQ)>5;U)oGZ&E5aeR*K z8Yt`Y0$G))Yd(Y3KH}tA4`-_QmNke5hU_|nq=xtyjwW(_o?itz>B>WM&^63bNdQ)k@-IgDHW*RW$Xo9#RzrTrCn7L2H{9Amq|qNg@#eZY=|P zCoI?2s+L)zsM%WX(NbVEY^`C>lFjIBYmJ6@DKJ0ZT4&F&WHW!dwa%QzOG!?jY_2(S zDcEzZbz*2Q!43|z))9yOP9X1Xt%DXzwY(3tl-TR=Qb_MbZYRrooh;dYYmS!U_as1(=YVB?Q_A|tNu5Ut&_q3jbfDM zoFxT^uEuH`nX3*sB%K?GuHUkweYReBwnHqh3P)~`+s3+Tj!rDA1e)8vuBv5J*IsxC zkd^~b(aGzArj08{>cnzOuy04C+C`}gb|Yz-1avxeWzev3NzcHbz_&4W@QCr$z3~w=8Ua- z`;vfG1~BP8CyLb=F7t1am~ph_#|O%$khSJ9%Vtcn)YmpgQxF?xM^_Vb+5fnpB^W0I`f%X8gb9#X{Q-yJG0{Z56aWeI&zPxnf5pdJA38bM`cYnS#x)% z`n1tFf$i)W-hGm(f9mde^=X@NcV_lFb=P`4&CI&H=IArijGwdCk&X@uQ$5xmj!~^? z#$ROCI)V-~t%L%GS#wo@U27ddR`4`3)WoB{R-4snfNrfee|kI8^bu#yDgYqOwas9# zmcb`3!kRJ`Cr=_tq)8aMt{aGtUZsqwVlj6DgCGre>AEt&x8H_in!x@uwgExIh|-mA zjdaC(29~CTVSaaF7HPbql&*9Uo8P@f)>LqCXclr}peS7_1BQ28u9PO8Eq1@`l3q9o zkfKCaO2?T?ZyA6loW<#9_c^O=m<&h}CA!ineAD@=(gbq`vyT|tiJ6#^B1$P;;qax` z55k&Q?wEh#87niLo*+n4L@65J(Nz~=Ya%7^(miLb(E>A3B@|Jjl;FU&D>o|9#7PJH z?|ago!o;WC^h=|T7PVBg(DAB}72cyUS zb(f>Bwbr!F1eTCO5fpj<{PqhY5>143p?~5ZA5H40);=@M#MYvrB6gqHbU_!GSY??i z%s=>-ciA4*zOOZHds0a(kWewZ4h(k8h(ua7HX)Au&mY~H8KY6(_cb$_&fA@QjIW-*heP3%$d!m5^AdnT}`12qA^c@!g3DOwZ5WwE2?)-yU z!)Vx#Mtxt?FzFTwK!77sy7)sMzUd->w4^bxtpM2j!b1pjgyk zGKwWGeb4)^zjy{9Es&PU1}gwg?|J#L$KJB7ett9@4M%-nGtIQr0>Fl@8-yh`-+1ed zS6r}(MeSvgSoFmH*_WPu@i?}!AB~2?;i&IxrkNg~cQ9Som98tcq)k^|eeER|Zl77t za-TVUc;DNvzVXJ%w52+#weN?+;i#{f#!Oc&z?81*N>^e~ltRS%ZI@lR{rs()HmqG! 
zx*}ZrI-EZ}ckJMiy>A^oofwDfC~IH)z8{VHKGT@#E5I(Ll&+MnMCl>~AV7+>Gi%mF zkU1QlKASdR0B80!YhP<$Ywi0?W2Ux45oPfxv9QolWzJPD^weBfvo4SONxP35106sAmh(e+vAs0GboFD@PvNs)jNPvarhW}0YliZEg{Gazv z+JDIpoojRVPr<*C|BTq<`6ga{5q^8^!|0cxe=rZ!zxH3%f5ZO0cQ*Z<^$Yt2{|Ek0 zyT|*F+CO@K;(owBKtGg!S^xj-Z~rga2m6nxKl9J=fBSuNKW_dLKWhJKeg^-Xe`^1? z`TyJj)8E!#>_3Y?uKrwqq3LJ#SGU>AzUO|6`nR^u&3FNN_jGOc zw)Nw`wr3yIKhgcee6IaN=ws>M{6677%)hPwx&HzC(f&u~&)6@b2kNRzBDQAP0*H73 zq%McOmRk{B3i47qRe=DA*$&odrbEJZ*pV9XXa&p@wlW~@Yfs>V{yiTtplMhgM*-Bz zsSnlq&pG;z0OUN%$~$3=g1UF+G*>+17eRbBf3=y79J}KR8owon@$1Z7MIrvvWWH)34nK2SD)GsrJ{l z1Cl#oVo3A8qY3e=aF)qzms~FG#2$LzT=gs&aVMOj>(%{y<&O0cG!nCiESl~x=^dF{ zKvj8F1K8Ng171wwM5Fh4KoQw`_c6#y$(5cAm7e}~nJ#A*fx+c9;y#&W!#VukR)ugk zKp3=+;Ut+IYn%m+r4d*<`L2h%aDnX5}^!5R|H;(34AoVWjRx(msBZvk;rCI*|~ zdOijqI@9Z{Vu!~jvHW{lBa$rnl4+!s_5sfK3bCGk-B%iDe&@-}+%fOKU|(9?V1 zHE8&@4z)Kx!RAvAs z!Wic9=o#(bg?kc-G68-m(jZ`^=XGUXb)}t(%&~sjFnV^sEX%hSy6UKC4iOhgV=BHV z2w`4g7Y=s#Vu2B_?#VQ|hP39@eArgfX>-0S+dd&^mx0*wp}>)x;c4RUgxz%;oNe?& z-7-lJ@Y^2^C;=qJsxx5|xF)*pTGhch2B&kxtn;f!7=gznk}I3}Dh}(CoMXgA5-p&kS202!l?!fT3t|HG*rIP~mS* z$Wjo}jq3}z$Qq!9yrtd3fM0N629ZM?LU$nv@Tv9b7I;D|;0H2dsA~g7Z7zp1| zB)XmrkMgF6OQr|R)HHD^TE{Y#j!~SR?b`Xt3Qs`B+x<hxexYeAjMUWdZ-*n9%(1)Wb(n2U<><7&9dwGJmrob)4%H? 
zlQ%z+L-^$dFhhH|@u$%97Qz?*Ynh2VG@q|?8vY&L74&fs&_b&3$x&Oyjl~LQDRRap zJU4U*R+(2Dd!G+lh8!V{pT_UJn+^1Qg6$` zqkNm(a#hWyc6SP+p5=C4HL8-m`pO`5o~`-LI?_h5CsH?F_%?nDodmz&pWR20WTpJE z?N|wSzLjMUK8E)a2tI}Lf;+;*M|h3Y(U#>)g1>zk9|Hd}oZAa2 zLYBWBoSW!Ts!RwXr^8h+U*@{9{zqS^iH)Op<;r`Uw~nc}<^$V~_i%$GFjaG?X1@E|M`h)nekvFKt`Dh-f>@|0-`Xoq)o` zx;JmzDfOV9qCx|EVpogEe0LK~tGS?5$$L_i6P$P6wIsCQaP_;d{{N=iV@+8LI}o#( zvo*Ejy=IIn{rdIQh1&q-{EuohpVOjJ^Q3lD*YTp37$^RRgn8ihpdu5{Ct%5-KO!VL zcNB6dUajXI9jkm-P|i3~GB-A(X`P1Oqqb$tcku)UJw0w3GeUijb__#QT4j%64z%EeB7S?jlWwx_7&+EEvB|6N=kV}DwnyAlX=?j`) zmU#!$*^@NIu#n_d7;WoJV@*Fbv9|yJO4;n|BNF2xy(54RyB>t~8lUOUW$&2%Nwi1y zx6JxW88>U2$#qhl^6KUbtmg9}D0o5vYDT7kWJthLGkpGnN4T>{St^_EU>4;DmLF9o zr|LqsA8_MoNLQ=}w?8u!ziSZ@PC#Y<#9uJFo-ozVo6D;<8j^1$c|qAE3ZTE5i~zmE z$BU5lw6l=EWsg^y^;8>r9qH{xfL|~PZYK#md$zZ0?o11gV<*WSW~cgy2GYGQir%wf zt4iW8D+;s*;RGrmd(-T<@2&j(Cb9xhV*l-x`TpK`xq|7p?5R%5*s!69?2c!cC*VY* z2DE^9pvOPLU!1e}wA8S8opcTJ3`NB>hY=JQnL~QFXR4K8A$BqJnoEB$wn-%u@E6Mh zCfMF4kusv3N!(aHC}4)Xs^xoOwXd%e^6pi5|DZo=Q25j+6HlJ^7FodH6y1bMROR^q zGu6)fopS`h%Sw<;ZH%TEPf+#81-#_v+@8nlR0jLcIDKQtLleOC)6yLZgC!D9X3GgS zohwU{v$jl=quD#Go^hB{`@Qw*a%`(^jyT~=q^bWgGzRj;|12J55HWdCWV}EB|K=%N z3Nq-qxJJ`>^|1MNN+q}zTB&ooE3j==AgK@^UW<^oSbeALa2peF)Th6{@sj0KyMNHZ zksk1+MXN2tv+22A%cQOGpS9)77(uP9mh+!5T5ERLvF@b}$+WvXM45Z?-kCa)fb~f1 znVbTD$Gx-0Zxc`0D@YgHakge6SL0H`-vN_x?AP0>iGH0_EE&=v83hMJgaKAI0jJXm zVxVz;X<$v6WW7}fxROO7vr#YLP;;lij5VrX{;>7kK6TtOH&6|Ar^xo>00%+u$C4@# z>!jOt6*3><171+WxoZnKDTzJtDRw+T030;yI}~uV@9fCnei^I*j>Bp&mzP2d=FPb_ zCM*l_+$LDR3B*a!A$g#>xsrZvw0lckxmMg>0aQd7tPyN=t{dgXb;Ie+T8{fZH=gdu zM7Rg9c(kg(Jg0?ARRRl=AONFKrvFj)lTY$KfT%6^6s`mk*ABGhsce*LsoD>K{z_M2 ziPpnu+lw22PfF!CoId^6n*G4H(Ix+#+N{C(da7t1BYMGEaE#PdpOLxsVD5riQXHp@OX;`S`8VnpM~)I920w~<3|mo0 zf8~Az`*?2?H&gZ&*K&bRkV@qzvMlRHXys8*Ze2+1c?5o!^+$&MHxB@4Ee5cke52R! 
zmn7AZtY6ST%ixgU5)%$%QcwHj7Es-Qu^kLAPwy%7pGBw_4Q9#da^W2$}axNHr03)_nw z5?yuNmXrI5HgS46)c5&}B)Tts49oU92>3xBLLy}FMUW=84DQbVq^;7_e7|(Sdz|&J z73N+M`rc2rt*oSWu#7S{*s~nH6HRHJS1SmzeXk|;CA)FI4bat3<%}nkB%;;?=F>B7ms9QSxv#@+69;@>QaR?REYX4&)=itG>rM{<{A79Rmk)`5ON#GL`*KX%}Ihk3w(RtM-WLt z?f&FLF}4N^yE!(pZ&Yj&Bc`~K0@4_}*0Om?wN|}4WJ>WL;G^H2*QpgEkGA~OET-Km zkwz|5{6dnz1U<2Pe9DNL>3g5FEIvp1jzP&2K#z~j%g6!7B;^zF+o95?fV{3mnB8*RMhCDNp>Am-3e@jNfMj?jHV$MWjk!DDKP zkAz$Y?Sr)!GUOX}qTQ5aMh|wq1uq}~joWyKl=b_LboM#wi{CMuz5x6BKlA-qy++cM01D3b7`uD z#l6M4pI;JCypO8JZ6?U&wNxR!{4oB_ zlV!x9+-&Qy6{%MQ{~yoZGkKiTSC`YS_j22~G;xUV855g2&C(zm^V!(wpcm@zn{%!g z4}JGo(sGZ1O~to-}le

UmY2RIYtNPVDpE$%vda+HD#3m z&VuXJ{BK&Qe+rBa7eq}Q(bq|tn(RrJAk|ztj2(i{d>nmQnM?;HF2k&9sA6up5tmjl z7lySlzMbifH17-m-Lwa_F&e7nOH?ESi3#ckR3tsM+jsck3`oG!uMS}|eAwVXv>}qxwq?QY%QJ0}r@^;fhuUA9W z*BVl>TGo&N004@xSiwDUXUvp51sVmqO3m)=B55aPwf@0=e}cN+$-BdKxY`YrT_4)0 z_d10#i44Q*rFr8MC>*)v$EJvz``(pb{e&*6k+b zsMz%($|1+8hn8c2?P(l@;Rb&CsZeYoCI3?2!LqjbwPXW3z4G$Qfj=cT5Yb%vY0(AX oeb?AaKtwrnc|$|zzw9vfvn^aJJ!zd)XFXqqy0000001=f@-~a#s literal 0 HcmV?d00001 diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/res/values/colors.xml b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/values/colors.xml new file mode 100644 index 000000000..f8c6127d3 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/values/colors.xml @@ -0,0 +1,10 @@ + + + #FFBB86FC + #FF6200EE + #FF3700B3 + #FF03DAC5 + #FF018786 + #FF000000 + #FFFFFFFF + \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/res/values/strings.xml b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/values/strings.xml new file mode 100644 index 000000000..05f2df090 --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/values/strings.xml @@ -0,0 +1,3 @@ + + SherpaOnnxSpeakerDiarization + \ No newline at end of file diff --git a/android/SherpaOnnxSpeakerDiarization/app/src/main/res/values/themes.xml b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/values/themes.xml new file mode 100644 index 000000000..34d1d96ed --- /dev/null +++ b/android/SherpaOnnxSpeakerDiarization/app/src/main/res/values/themes.xml @@ -0,0 +1,5 @@ + + + +