diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh new file mode 100755 index 000000000..9900a9db1 --- /dev/null +++ b/.github/scripts/test-nodejs-npm.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash + +set -ex + +echo "dir: $d" +cd $d +npm install +git status +ls -lh +ls -lh node_modules + +# offline asr + +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2 +tar xvf sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2 +rm sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2 +node ./test-offline-nemo-ctc.js +rm -rf sherpa-onnx-nemo-ctc-en-conformer-small + +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +rm sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2 +node ./test-offline-paraformer.js +rm -rf sherpa-onnx-paraformer-zh-2023-03-28 + +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 +tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 +rm sherpa-onnx-zipformer-en-2023-06-26.tar.bz2 +node ./test-offline-transducer.js +rm -rf sherpa-onnx-zipformer-en-2023-06-26 + +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 +tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 +rm sherpa-onnx-whisper-tiny.en.tar.bz2 +node ./test-offline-whisper.js +rm -rf sherpa-onnx-whisper-tiny.en + +# online asr +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 +tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 +rm sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2 +node ./test-online-paraformer.js +rm -rf sherpa-onnx-streaming-paraformer-bilingual-zh-en + +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +rm sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2 +node ./test-online-transducer.js +rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 + +# offline tts +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2 +tar xvf vits-vctk.tar.bz2 +rm vits-vctk.tar.bz2 +node ./test-offline-tts-en.js +rm -rf vits-vctk + +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 +tar xvf vits-zh-aishell3.tar.bz2 +rm vits-zh-aishell3.tar.bz2 +node ./test-offline-tts-zh.js +rm -rf vits-zh-aishell3 diff --git a/.github/workflows/dot-net.yaml b/.github/workflows/dot-net.yaml index 2741b486b..12bbc0150 100644 --- a/.github/workflows/dot-net.yaml +++ b/.github/workflows/dot-net.yaml @@ -4,6 +4,7 @@ on: push: branches: - dot-net + - fix-dot-net tags: - '*' diff --git a/.github/workflows/npm.yaml b/.github/workflows/npm.yaml new file mode 100644 index 000000000..98f633584 --- /dev/null +++ b/.github/workflows/npm.yaml @@ -0,0 +1,58 @@ +name: npm + +on: + workflow_dispatch: + +concurrency: + group: npm-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + nodejs: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.8"] + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Setup Python ${{ matrix.python-version 
}} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - uses: actions/setup-node@v3 + with: + node-version: 13 + registry-url: 'https://registry.npmjs.org' + + - name: Display node version + shell: bash + run: | + node --version + npm --version + cd nodejs-examples + + npm install npm@6.14.4 -g + npm install npm@6.14.4 + npm --version + + - name: Build nodejs package + shell: bash + env: + NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} + run: | + cd scripts/nodejs + ./run.sh + npm install + rm run.sh + npm ci + npm publish --provenance --access public diff --git a/.github/workflows/test-nodejs-npm.yaml b/.github/workflows/test-nodejs-npm.yaml new file mode 100644 index 000000000..4905d30d2 --- /dev/null +++ b/.github/workflows/test-nodejs-npm.yaml @@ -0,0 +1,59 @@ +name: test-nodejs-npm + +on: + workflow_dispatch: + + schedule: + # minute (0-59) + # hour (0-23) + # day of the month (1-31) + # month (1-12) + # day of the week (0-6) + # nightly build at 23:50 UTC time every day + - cron: "50 23 * * *" + +concurrency: + group: test-nodejs-npm-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + test-nodejs-npm: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] #, windows-latest] + python-version: ["3.8"] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - uses: actions/setup-node@v3 + with: + node-version: 13 + registry-url: 'https://registry.npmjs.org' + + - name: Display node version + shell: bash + run: | + node --version + npm --version + + - name: Run tests + shell: bash + run: | + node --version + npm --version + + export d=nodejs-examples + ./.github/scripts/test-nodejs-npm.sh diff --git a/.github/workflows/test-nodejs.yaml b/.github/workflows/test-nodejs.yaml new file mode 100644 index 000000000..4f9ecc56a --- /dev/null +++ b/.github/workflows/test-nodejs.yaml @@ -0,0 +1,108 @@ +name: test-nodejs + +on: + push: + branches: + - master + + pull_request: + branches: + - master + + workflow_dispatch: + +concurrency: + group: test-nodejs-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + test-nodejs: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] #, windows-latest] + python-version: ["3.8"] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ matrix.os }}-Release-ON + + - name: Configure CMake + shell: bash + run: | + export CMAKE_CXX_COMPILER_LAUNCHER=ccache + export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" + cmake --version + + mkdir build + cd build + cmake -D CMAKE_BUILD_TYPE=Release -D BUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=./install .. 
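+          # Build and install into ./install; a later step copies the shared libraries
+          # from build/install/lib into scripts/nodejs/lib for the npm package.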
+ make -j2 + make install + ls -lh install/lib + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Copy files + shell: bash + run: | + os=${{ matrix.os }} + if [[ $os == 'ubuntu-latest' ]]; then + mkdir -p scripts/nodejs/lib/linux-x64 + dst=scripts/nodejs/lib/linux-x64 + elif [[ $os == 'macos-latest' ]]; then + mkdir -p scripts/nodejs/lib/osx-x64 + dst=scripts/nodejs/lib/osx-x64 + fi + cp -v build/install/lib/* $dst/ + + - name: replace files + shell: bash + run: | + cd nodejs-examples + files=$(ls -1 *.js) + for f in ${files[@]}; do + echo $f + sed -i.bak s%\'sherpa-onnx\'%\'./index.js\'% $f + git status + done + git diff + cp *.js ../scripts/nodejs + + - uses: actions/setup-node@v3 + with: + node-version: 13 + registry-url: 'https://registry.npmjs.org' + + - name: Display node version + shell: bash + run: | + node --version + npm --version + + - name: Run tests + shell: bash + run: | + node --version + npm --version + export d=scripts/nodejs + + pushd $d + npm install + npm install wav + popd + + ./.github/scripts/test-nodejs-npm.sh diff --git a/.gitignore b/.gitignore index cd68dd483..256ff406e 100644 --- a/.gitignore +++ b/.gitignore @@ -67,3 +67,6 @@ swift-api-examples/k2fsa-* run-*.sh two-pass-*.sh build-* +vits-vctk +vits-zh-aishell3 +jslint.mjs diff --git a/CMakeLists.txt b/CMakeLists.txt index 748e835b3..eba77a2d8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.8.10") +set(SHERPA_ONNX_VERSION "1.8.11") # Disable warning about # diff --git a/nodejs-examples/.gitignore b/nodejs-examples/.gitignore new file mode 100644 index 000000000..d5f19d89b --- /dev/null +++ b/nodejs-examples/.gitignore @@ -0,0 +1,2 @@ +node_modules +package-lock.json diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md new file mode 100644 index 000000000..1ee665e31 --- /dev/null +++ b/nodejs-examples/README.md @@ -0,0 +1,247 @@ +# Introduction + +This directory contains nodejs examples for [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx). + +Before you continue, please first install the npm package `sherpa-onnx` by + +```bash +npm install sherpa-onnx +``` + +In the following, we describe how to use [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx) +for text-to-speech and speech-to-text. + +**Caution**: If you get the following error: +``` +/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/dynamic_library.js:67 + if (match = err.match(/^(([^ \t()])+\.so([^ \t:()])*):([ \t])*/)) { + ^ + +TypeError: Cannot read properties of null (reading 'match') + at new DynamicLibrary (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/dynamic_library.js:67:21) + at Object.Library (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/ffi-napi/lib/library.js:47:10) + at Object. (/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/node_modules/sherpa-onnx3/index.js:268:28) + at Module._compile (node:internal/modules/cjs/loader:1376:14) + at Module._extensions..js (node:internal/modules/cjs/loader:1435:10) + at Module.load (node:internal/modules/cjs/loader:1207:32) + at Module._load (node:internal/modules/cjs/loader:1023:12) + at Module.require (node:internal/modules/cjs/loader:1235:19) + at require (node:internal/modules/helpers:176:18) + at Object. 
(/Users/fangjun/open-source/sherpa-onnx/nodejs-examples/test-offline-tts-zh.js:3:21)
+```
+
+Please downgrade your node to version v13.14.0. See also
+https://github.com/node-ffi-napi/node-ffi-napi/issues/244
+and
+https://github.com/node-ffi-napi/node-ffi-napi/issues/97 .
+
+# Text-to-speech
+
+In the following, we demonstrate how to run text-to-speech.
+
+## ./test-offline-tts-en.js
+
+[./test-offline-tts-en.js](./test-offline-tts-en.js) shows how to use
+a pretrained VITS model
+[VCTK](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vctk-english-multi-speaker-109-speakers)
+for text-to-speech.
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2
+tar xvf vits-vctk.tar.bz2
+node ./test-offline-tts-en.js
+```
+
+## ./test-offline-tts-zh.js
+
+[./test-offline-tts-zh.js](./test-offline-tts-zh.js) shows how to use
+a pretrained VITS model
+[aishell3](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3)
+for text-to-speech.
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
+tar xvf vits-zh-aishell3.tar.bz2
+node ./test-offline-tts-zh.js
+```
+
+# Speech-to-text
+
+In the following, we demonstrate how to decode files and how to perform
+speech recognition from a microphone with `nodejs`. We need to install two
+additional npm packages:
+
+```bash
+npm install wav naudiodon2
+```
+
+## ./test-offline-nemo-ctc.js
+
+[./test-offline-nemo-ctc.js](./test-offline-nemo-ctc.js) demonstrates
+how to decode a file with a NeMo CTC model. In the code we use
+[stt_en_conformer_ctc_small](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-ctc/nemo/english.html#stt-en-conformer-ctc-small).
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2
+tar xvf sherpa-onnx-nemo-ctc-en-conformer-small.tar.bz2
+node ./test-offline-nemo-ctc.js
+```
+
+## ./test-offline-paraformer.js
+
+[./test-offline-paraformer.js](./test-offline-paraformer.js) demonstrates
+how to decode a file with a non-streaming Paraformer model. In the code we use
+[sherpa-onnx-paraformer-zh-2023-03-28](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-paraformer-zh-2023-03-28-chinese).
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+node ./test-offline-paraformer.js
+```
+
+## ./test-offline-transducer.js
+
+[./test-offline-transducer.js](./test-offline-transducer.js) demonstrates
+how to decode a file with a non-streaming transducer model. In the code we use
+[sherpa-onnx-zipformer-en-2023-06-26](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/offline-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-zipformer-en-2023-06-26-english).
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
+tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
+node ./test-offline-transducer.js
+```
+
+## ./test-offline-whisper.js
+
+[./test-offline-whisper.js](./test-offline-whisper.js) demonstrates
+how to decode a file with a Whisper model. In the code we use
+[sherpa-onnx-whisper-tiny.en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/whisper/tiny.en.html).
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+node ./test-offline-whisper.js
+```
+
+## ./test-online-paraformer-microphone.js
+
+[./test-online-paraformer-microphone.js](./test-online-paraformer-microphone.js)
+demonstrates how to do real-time speech recognition from a microphone
+with a streaming Paraformer model. In the code we use
+[sherpa-onnx-streaming-paraformer-bilingual-zh-en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-streaming-paraformer-bilingual-zh-en-chinese-english).
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+node ./test-online-paraformer-microphone.js
+```
+
+## ./test-online-paraformer.js
+
+[./test-online-paraformer.js](./test-online-paraformer.js) demonstrates
+how to decode a file using a streaming Paraformer model. In the code we use
+[sherpa-onnx-streaming-paraformer-bilingual-zh-en](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-paraformer/paraformer-models.html#csukuangfj-sherpa-onnx-streaming-paraformer-bilingual-zh-en-chinese-english).
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+tar xvf sherpa-onnx-streaming-paraformer-bilingual-zh-en.tar.bz2
+node ./test-online-paraformer.js
+```
+
+## ./test-online-transducer-microphone.js
+
+[./test-online-transducer-microphone.js](./test-online-transducer-microphone.js)
+demonstrates how to do real-time speech recognition from a microphone using a
+streaming transducer model. In the code we use
+[sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english).
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
+tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
+node ./test-online-transducer-microphone.js
+```
+
+## ./test-online-transducer.js
+
+[./test-online-transducer.js](./test-online-transducer.js) demonstrates
+how to decode a file using a streaming transducer model. In the code
+we use [sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20](https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20-bilingual-chinese-english).
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
+tar xvf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20.tar.bz2
+node ./test-online-transducer.js
+```
+
+## ./test-vad-microphone-offline-paraformer.js
+
+[./test-vad-microphone-offline-paraformer.js](./test-vad-microphone-offline-paraformer.js)
+demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
+with a non-streaming Paraformer model for speech recognition from a microphone.
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+tar xvf sherpa-onnx-paraformer-zh-2023-03-28.tar.bz2
+node ./test-vad-microphone-offline-paraformer.js
+```
+
+## ./test-vad-microphone-offline-transducer.js
+
+[./test-vad-microphone-offline-transducer.js](./test-vad-microphone-offline-transducer.js)
+demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
+with a non-streaming transducer model for speech recognition from a microphone.
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
+tar xvf sherpa-onnx-zipformer-en-2023-06-26.tar.bz2
+node ./test-vad-microphone-offline-transducer.js
+```
+
+## ./test-vad-microphone-offline-whisper.js
+
+[./test-vad-microphone-offline-whisper.js](./test-vad-microphone-offline-whisper.js)
+demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
+with a Whisper model for speech recognition from a microphone.
+
+You can use the following command to run it:
+
+```bash
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
+wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2
+tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2
+node ./test-vad-microphone-offline-whisper.js
+```
+
+## ./test-vad-microphone.js
+
+[./test-vad-microphone.js](./test-vad-microphone.js)
+demonstrates how to use [silero-vad](https://github.com/snakers4/silero-vad)
+to detect speech from a microphone.
+ +You can use the following command run it: + +```bash +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +node ./test-vad-microphone.js +``` diff --git a/nodejs-examples/package.json b/nodejs-examples/package.json new file mode 100644 index 000000000..278762641 --- /dev/null +++ b/nodejs-examples/package.json @@ -0,0 +1,7 @@ +{ + "dependencies": { + "naudiodon2": "^2.4.0", + "sherpa-onnx": "^1.8.11", + "wav": "^1.0.2" + } +} diff --git a/nodejs-examples/test-offline-nemo-ctc.js b/nodejs-examples/test-offline-nemo-ctc.js new file mode 100644 index 000000000..1cef7169b --- /dev/null +++ b/nodejs-examples/test-offline-nemo-ctc.js @@ -0,0 +1,97 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createRecognizer() { + const featConfig = new sherpa_onnx.FeatureConfig(); + featConfig.sampleRate = 16000; + featConfig.featureDim = 80; + + // test online recognizer + const nemoCtc = new sherpa_onnx.OfflineNemoEncDecCtcModelConfig(); + nemoCtc.model = './sherpa-onnx-nemo-ctc-en-conformer-small/model.int8.onnx'; + const tokens = './sherpa-onnx-nemo-ctc-en-conformer-small/tokens.txt'; + + const modelConfig = new sherpa_onnx.OfflineModelConfig(); + modelConfig.nemoCtc = nemoCtc; + modelConfig.tokens = tokens; + modelConfig.modelType = 'nemo_ctc'; + + const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); + recognizerConfig.featConfig = featConfig; + recognizerConfig.modelConfig = modelConfig; + recognizerConfig.decodingMethod = 'greedy_search'; + + const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); + return recognizer; +} + +recognizer = createRecognizer(); +stream = recognizer.createStream(); + +const waveFilename = + './sherpa-onnx-nemo-ctc-en-conformer-small/test_wavs/0.wav'; + +const reader = new wav.Reader(); +const readable = new Readable().wrap(reader); +const buf = []; + +reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { + if (sampleRate != recognizer.config.featConfig.sampleRate) { + throw new Error(`Only support sampleRate ${ + recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); + } + + if (audioFormat != 1) { + throw new Error(`Only support PCM format. Given ${audioFormat}`); + } + + if (channels != 1) { + throw new Error(`Only a single channel. Given ${channel}`); + } + + if (bitDepth != 16) { + throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); + } +}); + +fs.createReadStream(waveFilename, {highWaterMark: 4096}) + .pipe(reader) + .on('finish', function(err) { + // tail padding + const floatSamples = + new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); + + buf.push(floatSamples); + const flattened = + Float32Array.from(buf.reduce((a, b) => [...a, ...b], [])); + + stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); + recognizer.decode(stream); + const r = recognizer.getResult(stream); + console.log(r.text); + + stream.free(); + recognizer.free(); + }); + +readable.on('readable', function() { + let chunk; + while ((chunk = readable.read()) != null) { + const int16Samples = new Int16Array( + chunk.buffer, chunk.byteOffset, + chunk.length / Int16Array.BYTES_PER_ELEMENT); + + const floatSamples = new Float32Array(int16Samples.length); + + for (let i = 0; i < floatSamples.length; i++) { + floatSamples[i] = int16Samples[i] / 32768.0; + } + + buf.push(floatSamples); + } +}); diff --git a/nodejs-examples/test-offline-paraformer.js b/nodejs-examples/test-offline-paraformer.js new file mode 100644 index 000000000..c96977b40 --- /dev/null +++ b/nodejs-examples/test-offline-paraformer.js @@ -0,0 +1,95 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) + +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createRecognizer() { + const featConfig = new sherpa_onnx.FeatureConfig(); + featConfig.sampleRate = 16000; + featConfig.featureDim = 80; + + // test online recognizer + const paraformer = new sherpa_onnx.OfflineParaformerModelConfig(); + paraformer.model = './sherpa-onnx-paraformer-zh-2023-03-28/model.onnx'; + const tokens = './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt'; + + const modelConfig = new sherpa_onnx.OfflineModelConfig(); + modelConfig.paraformer = paraformer; + modelConfig.tokens = tokens; + modelConfig.modelType = 'paraformer'; + + const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); + recognizerConfig.featConfig = featConfig; + recognizerConfig.modelConfig = modelConfig; + recognizerConfig.decodingMethod = 'greedy_search'; + + const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); + return recognizer; +} + +recognizer = createRecognizer(); +stream = recognizer.createStream(); + +const waveFilename = './sherpa-onnx-paraformer-zh-2023-03-28/test_wavs/0.wav'; + +const reader = new wav.Reader(); +const readable = new Readable().wrap(reader); +const buf = []; + +reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { + if (sampleRate != recognizer.config.featConfig.sampleRate) { + throw new Error(`Only support sampleRate ${ + recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); + } + + if (audioFormat != 1) { + throw new Error(`Only support PCM format. Given ${audioFormat}`); + } + + if (channels != 1) { + throw new Error(`Only a single channel. Given ${channel}`); + } + + if (bitDepth != 16) { + throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); + } +}); + +fs.createReadStream(waveFilename, {'highWaterMark': 4096}) + .pipe(reader) + .on('finish', function(err) { + // tail padding + const floatSamples = + new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); + + buf.push(floatSamples); + const flattened = + Float32Array.from(buf.reduce((a, b) => [...a, ...b], [])); + + stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); + recognizer.decode(stream); + const r = recognizer.getResult(stream); + console.log(r.text); + + stream.free(); + recognizer.free(); + }); + +readable.on('readable', function() { + let chunk; + while ((chunk = readable.read()) != null) { + const int16Samples = new Int16Array( + chunk.buffer, chunk.byteOffset, + chunk.length / Int16Array.BYTES_PER_ELEMENT); + + const floatSamples = new Float32Array(int16Samples.length); + for (let i = 0; i < floatSamples.length; i++) { + floatSamples[i] = int16Samples[i] / 32768.0; + } + + buf.push(floatSamples); + } +}); diff --git a/nodejs-examples/test-offline-transducer.js b/nodejs-examples/test-offline-transducer.js new file mode 100644 index 000000000..d86cb67b6 --- /dev/null +++ b/nodejs-examples/test-offline-transducer.js @@ -0,0 +1,100 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createRecognizer() { + const featConfig = new sherpa_onnx.FeatureConfig(); + featConfig.sampleRate = 16000; + featConfig.featureDim = 80; + + // test online recognizer + const transducer = new sherpa_onnx.OfflineTransducerModelConfig(); + transducer.encoder = + './sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx'; + transducer.decoder = + './sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx'; + transducer.joiner = + './sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx'; + const tokens = './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt'; + + const modelConfig = new sherpa_onnx.OfflineModelConfig(); + modelConfig.transducer = transducer; + modelConfig.tokens = tokens; + modelConfig.modelType = 'transducer'; + + const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); + recognizerConfig.featConfig = featConfig; + recognizerConfig.modelConfig = modelConfig; + recognizerConfig.decodingMethod = 'greedy_search'; + + const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); + return recognizer; +} + +recognizer = createRecognizer(); +stream = recognizer.createStream(); + +const waveFilename = './sherpa-onnx-zipformer-en-2023-06-26/test_wavs/0.wav'; + +const reader = new wav.Reader(); +const readable = new Readable().wrap(reader); +const buf = []; + +reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { + if (sampleRate != recognizer.config.featConfig.sampleRate) { + throw new Error(`Only support sampleRate ${ + recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); + } + + if (audioFormat != 1) { + throw new Error(`Only support PCM format. Given ${audioFormat}`); + } + + if (channels != 1) { + throw new Error(`Only a single channel. Given ${channel}`); + } + + if (bitDepth != 16) { + throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); + } +}); + +fs.createReadStream(waveFilename, {'highWaterMark': 4096}) + .pipe(reader) + .on('finish', function(err) { + // tail padding + const floatSamples = + new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); + + buf.push(floatSamples); + const flattened = + Float32Array.from(buf.reduce((a, b) => [...a, ...b], [])); + + stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); + recognizer.decode(stream); + const r = recognizer.getResult(stream); + console.log(r.text); + + stream.free(); + recognizer.free(); + }); + +readable.on('readable', function() { + let chunk; + while ((chunk = readable.read()) != null) { + const int16Samples = new Int16Array( + chunk.buffer, chunk.byteOffset, + chunk.length / Int16Array.BYTES_PER_ELEMENT); + + const floatSamples = new Float32Array(int16Samples.length); + for (let i = 0; i < floatSamples.length; i++) { + floatSamples[i] = int16Samples[i] / 32768.0; + } + + buf.push(floatSamples); + } +}); diff --git a/nodejs-examples/test-offline-tts-en.js b/nodejs-examples/test-offline-tts-en.js new file mode 100644 index 000000000..e44e1a55c --- /dev/null +++ b/nodejs-examples/test-offline-tts-en.js @@ -0,0 +1,27 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) + +const sherpa_onnx = require('sherpa-onnx'); + +function createOfflineTts() { + const vits = new sherpa_onnx.OfflineTtsVitsModelConfig(); + vits.model = './vits-vctk/vits-vctk.onnx'; + vits.lexicon = './vits-vctk/lexicon.txt'; + vits.tokens = './vits-vctk/tokens.txt'; + + const modelConfig = new sherpa_onnx.OfflineTtsModelConfig(); + modelConfig.vits = vits; + + const config = new sherpa_onnx.OfflineTtsConfig(); + config.model = modelConfig; + + return new sherpa_onnx.OfflineTts(config); +} + +const tts = createOfflineTts(); +const speakerId = 99; +const speed = 1.0; +const audio = + tts.generate('Good morning. 
How are you doing?', speakerId, speed); +audio.save('./test-en.wav'); +console.log('Saved to test-en.wav successfully.'); +tts.free(); diff --git a/nodejs-examples/test-offline-tts-zh.js b/nodejs-examples/test-offline-tts-zh.js new file mode 100644 index 000000000..16555c82b --- /dev/null +++ b/nodejs-examples/test-offline-tts-zh.js @@ -0,0 +1,27 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) + +const sherpa_onnx = require('sherpa-onnx'); + +function createOfflineTts() { + const vits = new sherpa_onnx.OfflineTtsVitsModelConfig(); + vits.model = './vits-zh-aishell3/vits-aishell3.onnx'; + vits.lexicon = './vits-zh-aishell3/lexicon.txt'; + vits.tokens = './vits-zh-aishell3/tokens.txt'; + + const modelConfig = new sherpa_onnx.OfflineTtsModelConfig(); + modelConfig.vits = vits; + + const config = new sherpa_onnx.OfflineTtsConfig(); + config.model = modelConfig; + config.ruleFsts = './vits-zh-aishell3/rule.fst'; + + return new sherpa_onnx.OfflineTts(config); +} + +const tts = createOfflineTts(); +const speakerId = 66; +const speed = 1.0; +const audio = tts.generate('3年前中国总人口是1411778724人', speakerId, speed); +audio.save('./test-zh.wav'); +console.log('Saved to test-zh.wav successfully.'); +tts.free(); diff --git a/nodejs-examples/test-offline-whisper.js b/nodejs-examples/test-offline-whisper.js new file mode 100644 index 000000000..1dd320bdf --- /dev/null +++ b/nodejs-examples/test-offline-whisper.js @@ -0,0 +1,97 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createRecognizer() { + const featConfig = new sherpa_onnx.FeatureConfig(); + featConfig.sampleRate = 16000; + featConfig.featureDim = 80; + + // test online recognizer + const whisper = new sherpa_onnx.OfflineWhisperModelConfig(); + whisper.encoder = './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx'; + whisper.decoder = './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx'; + const tokens = './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt'; + + const modelConfig = new sherpa_onnx.OfflineModelConfig(); + modelConfig.whisper = whisper; + modelConfig.tokens = tokens; + modelConfig.modelType = 'whisper'; + + const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); + recognizerConfig.featConfig = featConfig; + recognizerConfig.modelConfig = modelConfig; + recognizerConfig.decodingMethod = 'greedy_search'; + + const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); + return recognizer; +} + +recognizer = createRecognizer(); +stream = recognizer.createStream(); + +const waveFilename = './sherpa-onnx-whisper-tiny.en/test_wavs/0.wav'; + +const reader = new wav.Reader(); +const readable = new Readable().wrap(reader); +const buf = []; + +reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { + if (sampleRate != recognizer.config.featConfig.sampleRate) { + throw new Error(`Only support sampleRate ${ + recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); + } + + if (audioFormat != 1) { + throw new Error(`Only support PCM format. Given ${audioFormat}`); + } + + if (channels != 1) { + throw new Error(`Only a single channel. Given ${channel}`); + } + + if (bitDepth != 16) { + throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); + } +}); + +fs.createReadStream(waveFilename, {'highWaterMark': 4096}) + .pipe(reader) + .on('finish', function(err) { + // tail padding + const floatSamples = + new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); + + buf.push(floatSamples); + const flattened = + Float32Array.from(buf.reduce((a, b) => [...a, ...b], [])); + + stream.acceptWaveform(recognizer.config.featConfig.sampleRate, flattened); + recognizer.decode(stream); + const r = recognizer.getResult(stream); + console.log(r.text); + + stream.free(); + recognizer.free(); + }); + +readable.on('readable', function() { + let chunk; + while ((chunk = readable.read()) != null) { + const int16Samples = new Int16Array( + chunk.buffer, chunk.byteOffset, + chunk.length / Int16Array.BYTES_PER_ELEMENT); + + const floatSamples = new Float32Array(int16Samples.length); + + for (let i = 0; i < floatSamples.length; i++) { + floatSamples[i] = int16Samples[i] / 32768.0; + } + + buf.push(floatSamples); + } +}); diff --git a/nodejs-examples/test-online-paraformer-microphone.js b/nodejs-examples/test-online-paraformer-microphone.js new file mode 100644 index 000000000..60b28f6f9 --- /dev/null +++ b/nodejs-examples/test-online-paraformer-microphone.js @@ -0,0 +1,86 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const portAudio = require('naudiodon2'); +console.log(portAudio.getDevices()); + +const sherpa_onnx = require('sherpa-onnx'); + +function createRecognizer() { + const featConfig = new sherpa_onnx.FeatureConfig(); + featConfig.sampleRate = 16000; + featConfig.featureDim = 80; + + const paraformer = new sherpa_onnx.OnlineParaformerModelConfig(); + paraformer.encoder = + './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx'; + paraformer.decoder = + './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx'; + const tokens = + './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt'; + + const modelConfig = new sherpa_onnx.OnlineModelConfig(); + modelConfig.paraformer = paraformer; + modelConfig.tokens = tokens; + modelConfig.modelType = 'paraformer'; + + const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig(); + recognizerConfig.featConfig = featConfig; + recognizerConfig.modelConfig = modelConfig; + recognizerConfig.decodingMethod = 'greedy_search'; + recognizerConfig.enableEndpoint = 1; + + const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig); + return recognizer; +} +recognizer = createRecognizer(); +stream = recognizer.createStream(); + +display = new sherpa_onnx.Display(50); + +let lastText = ''; +let segmentIndex = 0; + +const ai = new portAudio.AudioIO({ + inOptions: { + channelCount: 1, + closeOnError: true, // Close the stream if an audio error is detected, if + // set false then just log the error + deviceId: -1, // Use -1 or omit the deviceId to select the default device + sampleFormat: portAudio.SampleFormatFloat32, + sampleRate: recognizer.config.featConfig.sampleRate + } +}); + +ai.on('data', data => { + const samples = new Float32Array(data.buffer); + + stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples); + + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + + const isEndpoint = recognizer.isEndpoint(stream); + const text = recognizer.getResult(stream).text; + + if (text.length > 0 && lastText != text) { + lastText = text; + display.print(segmentIndex, lastText); + } + if (isEndpoint) { + if (text.length > 0) { + lastText = text; + segmentIndex += 1; + } + 
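+    // An endpoint marks the end of an utterance: reset the stream so the
+    // next utterance is decoded as a new segment.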
recognizer.reset(stream) + } +}); + +ai.on('close', () => { + console.log('Free resources'); + stream.free(); + recognizer.free(); +}); + +ai.start(); +console.log('Started! Please speak') diff --git a/nodejs-examples/test-online-paraformer.js b/nodejs-examples/test-online-paraformer.js new file mode 100644 index 000000000..e2b6a01b7 --- /dev/null +++ b/nodejs-examples/test-online-paraformer.js @@ -0,0 +1,99 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createRecognizer() { + const featConfig = new sherpa_onnx.FeatureConfig(); + featConfig.sampleRate = 16000; + featConfig.featureDim = 80; + + const paraformer = new sherpa_onnx.OnlineParaformerModelConfig(); + paraformer.encoder = + './sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.onnx'; + paraformer.decoder = + './sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.onnx'; + const tokens = + './sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt'; + + const modelConfig = new sherpa_onnx.OnlineModelConfig(); + modelConfig.paraformer = paraformer; + modelConfig.tokens = tokens; + modelConfig.modelType = 'paraformer'; + + const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig(); + recognizerConfig.featConfig = featConfig; + recognizerConfig.modelConfig = modelConfig; + recognizerConfig.decodingMethod = 'greedy_search'; + + const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig); + return recognizer; +} +recognizer = createRecognizer(); +stream = recognizer.createStream(); + +const waveFilename = + './sherpa-onnx-streaming-paraformer-bilingual-zh-en/test_wavs/0.wav'; + +const reader = new wav.Reader(); +const readable = new Readable().wrap(reader); + +function decode(samples) { + stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples); + + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + const r = recognizer.getResult(stream); + console.log(r.text); +} + +reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { + if (sampleRate != recognizer.config.featConfig.sampleRate) { + throw new Error(`Only support sampleRate ${ + recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); + } + + if (audioFormat != 1) { + throw new Error(`Only support PCM format. Given ${audioFormat}`); + } + + if (channels != 1) { + throw new Error(`Only a single channel. Given ${channel}`); + } + + if (bitDepth != 16) { + throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); + } +}); + +fs.createReadStream(waveFilename, {'highWaterMark': 4096}) + .pipe(reader) + .on('finish', function(err) { + // tail padding + const floatSamples = + new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); + decode(floatSamples); + stream.free(); + recognizer.free(); + }); + +readable.on('readable', function() { + let chunk; + while ((chunk = readable.read()) != null) { + const int16Samples = new Int16Array( + chunk.buffer, chunk.byteOffset, + chunk.length / Int16Array.BYTES_PER_ELEMENT); + + const floatSamples = new Float32Array(int16Samples.length); + + for (let i = 0; i < floatSamples.length; i++) { + floatSamples[i] = int16Samples[i] / 32768.0; + } + + decode(floatSamples); + } +}); diff --git a/nodejs-examples/test-online-transducer-microphone.js b/nodejs-examples/test-online-transducer-microphone.js new file mode 100644 index 000000000..f16f10d76 --- /dev/null +++ b/nodejs-examples/test-online-transducer-microphone.js @@ -0,0 +1,88 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const portAudio = require('naudiodon2'); +// console.log(portAudio.getDevices()); + +const sherpa_onnx = require('sherpa-onnx'); + +function createRecognizer() { + const featConfig = new sherpa_onnx.FeatureConfig(); + featConfig.sampleRate = 16000; + featConfig.featureDim = 80; + + // test online recognizer + const transducer = new sherpa_onnx.OnlineTransducerModelConfig(); + transducer.encoder = + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx'; + transducer.decoder = + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx'; + transducer.joiner = + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx'; + const tokens = + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt'; + + const modelConfig = new sherpa_onnx.OnlineModelConfig(); + modelConfig.transducer = transducer; + modelConfig.tokens = tokens; + modelConfig.modelType = 'zipformer'; + + const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig(); + recognizerConfig.featConfig = featConfig; + recognizerConfig.modelConfig = modelConfig; + recognizerConfig.decodingMethod = 'greedy_search'; + recognizerConfig.enableEndpoint = 1; + + const recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig); + return recognizer; +} +recognizer = createRecognizer(); +stream = recognizer.createStream(); +display = new sherpa_onnx.Display(50); + +let lastText = ''; +let segmentIndex = 0; + +const ai = new portAudio.AudioIO({ + inOptions: { + channelCount: 1, + closeOnError: true, // Close the stream if an audio error is detected, if + // set false then just log the error + deviceId: -1, // Use -1 or omit the deviceId to select the default device + sampleFormat: portAudio.SampleFormatFloat32, + sampleRate: recognizer.config.featConfig.sampleRate + } +}); + +ai.on('data', data => { + const samples = new Float32Array(data.buffer); + + stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples); + + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + + const isEndpoint = recognizer.isEndpoint(stream); + const text = recognizer.getResult(stream).text; + + if (text.length > 0 && lastText != text) { + lastText = text; + display.print(segmentIndex, lastText); + } + if (isEndpoint) { + if (text.length > 0) { + lastText = text; + segmentIndex += 1; + } + recognizer.reset(stream) + } +}); + +ai.on('close', () => { + 
console.log('Free resources'); + stream.free(); + recognizer.free(); +}); + +ai.start(); +console.log('Started! Please speak') diff --git a/nodejs-examples/test-online-transducer.js b/nodejs-examples/test-online-transducer.js new file mode 100644 index 000000000..822b97dae --- /dev/null +++ b/nodejs-examples/test-online-transducer.js @@ -0,0 +1,102 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const fs = require('fs'); +const {Readable} = require('stream'); +const wav = require('wav'); + +const sherpa_onnx = require('sherpa-onnx'); + +function createRecognizer() { + const featConfig = new sherpa_onnx.FeatureConfig(); + featConfig.sampleRate = 16000; + featConfig.featureDim = 80; + + // test online recognizer + const transducer = new sherpa_onnx.OnlineTransducerModelConfig(); + transducer.encoder = + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/encoder-epoch-99-avg-1.int8.onnx'; + transducer.decoder = + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/decoder-epoch-99-avg-1.onnx'; + transducer.joiner = + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/joiner-epoch-99-avg-1.onnx'; + const tokens = + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/tokens.txt'; + + const modelConfig = new sherpa_onnx.OnlineModelConfig(); + modelConfig.transducer = transducer; + modelConfig.tokens = tokens; + modelConfig.modelType = 'zipformer'; + + const recognizerConfig = new sherpa_onnx.OnlineRecognizerConfig(); + recognizerConfig.featConfig = featConfig; + recognizerConfig.modelConfig = modelConfig; + recognizerConfig.decodingMethod = 'greedy_search'; + + recognizer = new sherpa_onnx.OnlineRecognizer(recognizerConfig); + return recognizer; +} +recognizer = createRecognizer(); +stream = recognizer.createStream(); + +const waveFilename = + './sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20/test_wavs/0.wav'; + +const reader = new wav.Reader(); +const readable = new Readable().wrap(reader); + +function decode(samples) { + stream.acceptWaveform(recognizer.config.featConfig.sampleRate, samples); + + while (recognizer.isReady(stream)) { + recognizer.decode(stream); + } + const r = recognizer.getResult(stream); + console.log(r.text); +} + +reader.on('format', ({audioFormat, bitDepth, channels, sampleRate}) => { + if (sampleRate != recognizer.config.featConfig.sampleRate) { + throw new Error(`Only support sampleRate ${ + recognizer.config.featConfig.sampleRate}. Given ${sampleRate}`); + } + + if (audioFormat != 1) { + throw new Error(`Only support PCM format. Given ${audioFormat}`); + } + + if (channels != 1) { + throw new Error(`Only a single channel. Given ${channel}`); + } + + if (bitDepth != 16) { + throw new Error(`Only support 16-bit samples. 
Given ${bitDepth}`); + } +}); + +fs.createReadStream(waveFilename, {'highWaterMark': 4096}) + .pipe(reader) + .on('finish', function(err) { + // tail padding + const floatSamples = + new Float32Array(recognizer.config.featConfig.sampleRate * 0.5); + decode(floatSamples); + stream.free(); + recognizer.free(); + }); + +readable.on('readable', function() { + let chunk; + while ((chunk = readable.read()) != null) { + const int16Samples = new Int16Array( + chunk.buffer, chunk.byteOffset, + chunk.length / Int16Array.BYTES_PER_ELEMENT); + + const floatSamples = new Float32Array(int16Samples.length); + + for (let i = 0; i < floatSamples.length; i++) { + floatSamples[i] = int16Samples[i] / 32768.0; + } + + decode(floatSamples); + } +}); diff --git a/nodejs-examples/test-vad-microphone-offline-paraformer.js b/nodejs-examples/test-vad-microphone-offline-paraformer.js new file mode 100644 index 000000000..f5311bea4 --- /dev/null +++ b/nodejs-examples/test-vad-microphone-offline-paraformer.js @@ -0,0 +1,101 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const sherpa_onnx = require('sherpa-onnx3'); +const portAudio = require('naudiodon2'); +console.log(portAudio.getDevices()); + +function createOfflineRecognizer() { + const featConfig = new sherpa_onnx.FeatureConfig(); + featConfig.sampleRate = 16000; + featConfig.featureDim = 80; + + // test online recognizer + const paraformer = new sherpa_onnx.OfflineParaformerModelConfig(); + paraformer.model = './sherpa-onnx-paraformer-zh-2023-03-28/model.int8.onnx'; + const tokens = './sherpa-onnx-paraformer-zh-2023-03-28/tokens.txt'; + + const modelConfig = new sherpa_onnx.OfflineModelConfig(); + modelConfig.paraformer = paraformer; + modelConfig.tokens = tokens; + modelConfig.modelType = 'paraformer'; + + const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); + recognizerConfig.featConfig = featConfig; + recognizerConfig.modelConfig = modelConfig; + recognizerConfig.decodingMethod = 'greedy_search'; + + const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); + return recognizer +} + +function createVad() { + const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig(); + sileroVadModelConfig.model = './silero_vad.onnx'; + sileroVadModelConfig.minSpeechDuration = 0.3; // seconds + sileroVadModelConfig.minSilenceDuration = 0.3; // seconds + sileroVadModelConfig.windowSize = 512; + + const vadModelConfig = new sherpa_onnx.VadModelConfig(); + vadModelConfig.sileroVad = sileroVadModelConfig; + vadModelConfig.sampleRate = 16000; + + const bufferSizeInSeconds = 60; + const vad = new sherpa_onnx.VoiceActivityDetector( + vadModelConfig, bufferSizeInSeconds); + return vad; +} + +const recognizer = createOfflineRecognizer(); +const vad = createVad(); + +const bufferSizeInSeconds = 30; +const buffer = + new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate); + +var ai = new portAudio.AudioIO({ + inOptions: { + channelCount: 1, + sampleFormat: portAudio.SampleFormatFloat32, + sampleRate: vad.config.sampleRate, + deviceId: -1, // Use -1 or omit the deviceId to select the default device + closeOnError: true // Close the stream if an audio error is detected, if + // set false then just log the error + } +}); + +let printed = false; +let index = 0; +ai.on('data', data => { + const windowSize = vad.config.sileroVad.windowSize; + buffer.push(new Float32Array(data.buffer)); + while (buffer.size() > windowSize) { + const samples = buffer.get(buffer.head(), windowSize); + buffer.pop(windowSize); + 
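+    // silero-vad consumes fixed-size windows of windowSize samples taken
+    // from the circular buffer.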
vad.acceptWaveform(samples) + } + + while (!vad.isEmpty()) { + const segment = vad.front(); + vad.pop(); + const stream = recognizer.createStream(); + stream.acceptWaveform( + recognizer.config.featConfig.sampleRate, segment.samples); + recognizer.decode(stream); + const r = recognizer.getResult(stream); + stream.free(); + if (r.text.length > 0) { + console.log(`${index}: ${r.text}`); + index += 1; + } + } +}); + +ai.on('close', () => { + console.log('Free resources'); + recognizer.free(); + vad.free(); + buffer.free(); +}); + +ai.start(); +console.log('Started! Please speak') diff --git a/nodejs-examples/test-vad-microphone-offline-transducer.js b/nodejs-examples/test-vad-microphone-offline-transducer.js new file mode 100644 index 000000000..4cf6d7176 --- /dev/null +++ b/nodejs-examples/test-vad-microphone-offline-transducer.js @@ -0,0 +1,106 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const sherpa_onnx = require('sherpa-onnx'); +const portAudio = require('naudiodon2'); +console.log(portAudio.getDevices()); + +function createOfflineRecognizer() { + const featConfig = new sherpa_onnx.FeatureConfig(); + featConfig.sampleRate = 16000; + featConfig.featureDim = 80; + + // test online recognizer + const transducer = new sherpa_onnx.OfflineTransducerModelConfig(); + transducer.encoder = + './sherpa-onnx-zipformer-en-2023-06-26/encoder-epoch-99-avg-1.onnx'; + transducer.decoder = + './sherpa-onnx-zipformer-en-2023-06-26/decoder-epoch-99-avg-1.onnx'; + transducer.joiner = + './sherpa-onnx-zipformer-en-2023-06-26/joiner-epoch-99-avg-1.onnx'; + const tokens = './sherpa-onnx-zipformer-en-2023-06-26/tokens.txt'; + + const modelConfig = new sherpa_onnx.OfflineModelConfig(); + modelConfig.transducer = transducer; + modelConfig.tokens = tokens; + modelConfig.modelType = 'transducer'; + + const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); + recognizerConfig.featConfig = featConfig; + recognizerConfig.modelConfig = modelConfig; + recognizerConfig.decodingMethod = 'greedy_search'; + + const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); + return recognizer; +} + +function createVad() { + const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig(); + sileroVadModelConfig.model = './silero_vad.onnx'; + sileroVadModelConfig.minSpeechDuration = 0.3; // seconds + sileroVadModelConfig.minSilenceDuration = 0.3; // seconds + sileroVadModelConfig.windowSize = 512; + + const vadModelConfig = new sherpa_onnx.VadModelConfig(); + vadModelConfig.sileroVad = sileroVadModelConfig; + vadModelConfig.sampleRate = 16000; + + const bufferSizeInSeconds = 60; + const vad = new sherpa_onnx.VoiceActivityDetector( + vadModelConfig, bufferSizeInSeconds); + return vad; +} + +const recognizer = createOfflineRecognizer(); +const vad = createVad(); + +const bufferSizeInSeconds = 30; +const buffer = + new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate); + +const ai = new portAudio.AudioIO({ + inOptions: { + channelCount: 1, + closeOnError: true, // Close the stream if an audio error is detected, if + // set false then just log the error + deviceId: -1, // Use -1 or omit the deviceId to select the default device + sampleFormat: portAudio.SampleFormatFloat32, + sampleRate: vad.config.sampleRate + } +}); + +let printed = false; +let index = 0; +ai.on('data', data => { + const windowSize = vad.config.sileroVad.windowSize; + buffer.push(new Float32Array(data.buffer)); + while (buffer.size() > windowSize) { + const samples = 
buffer.get(buffer.head(), windowSize); + buffer.pop(windowSize); + vad.acceptWaveform(samples) + } + + while (!vad.isEmpty()) { + const segment = vad.front(); + vad.pop(); + const stream = recognizer.createStream(); + stream.acceptWaveform( + recognizer.config.featConfig.sampleRate, segment.samples); + recognizer.decode(stream); + const r = recognizer.getResult(stream); + stream.free(); + if (r.text.length > 0) { + console.log(`${index}: ${r.text}`); + index += 1; + } + } +}); + +ai.on('close', () => { + console.log('Free resources'); + recognizer.free(); + vad.free(); + buffer.free(); +}); + +ai.start(); +console.log('Started! Please speak') diff --git a/nodejs-examples/test-vad-microphone-offline-whisper.js b/nodejs-examples/test-vad-microphone-offline-whisper.js new file mode 100644 index 000000000..07a344b89 --- /dev/null +++ b/nodejs-examples/test-vad-microphone-offline-whisper.js @@ -0,0 +1,102 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +const sherpa_onnx = require('sherpa-onnx'); +const portAudio = require('naudiodon2'); +console.log(portAudio.getDevices()); + +function createOfflineRecognizer() { + const featConfig = new sherpa_onnx.FeatureConfig(); + featConfig.sampleRate = 16000; + featConfig.featureDim = 80; + + // test online recognizer + const whisper = new sherpa_onnx.OfflineWhisperModelConfig(); + whisper.encoder = './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx'; + whisper.decoder = './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx'; + const tokens = './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt'; + + const modelConfig = new sherpa_onnx.OfflineModelConfig(); + modelConfig.whisper = whisper; + modelConfig.tokens = tokens; + modelConfig.modelType = 'whisper'; + + const recognizerConfig = new sherpa_onnx.OfflineRecognizerConfig(); + recognizerConfig.featConfig = featConfig; + recognizerConfig.modelConfig = modelConfig; + recognizerConfig.decodingMethod = 'greedy_search'; + + const recognizer = new sherpa_onnx.OfflineRecognizer(recognizerConfig); + return recognizer; +} + +function createVad() { + const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig(); + sileroVadModelConfig.model = './silero_vad.onnx'; + sileroVadModelConfig.minSpeechDuration = 0.3; // seconds + sileroVadModelConfig.minSilenceDuration = 0.3; // seconds + sileroVadModelConfig.windowSize = 512; + + const vadModelConfig = new sherpa_onnx.VadModelConfig(); + vadModelConfig.sileroVad = sileroVadModelConfig; + vadModelConfig.sampleRate = 16000; + + const bufferSizeInSeconds = 60; + const vad = new sherpa_onnx.VoiceActivityDetector( + vadModelConfig, bufferSizeInSeconds); + return vad; +} + +const recognizer = createOfflineRecognizer(); +const vad = createVad(); + +const bufferSizeInSeconds = 30; +const buffer = + new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate); + +const ai = new portAudio.AudioIO({ + inOptions: { + channelCount: 1, + closeOnError: true, // Close the stream if an audio error is detected, if + // set false then just log the error + deviceId: -1, // Use -1 or omit the deviceId to select the default device + sampleFormat: portAudio.SampleFormatFloat32, + sampleRate: vad.config.sampleRate + } +}); + +let printed = false; +let index = 0; +ai.on('data', data => { + const windowSize = vad.config.sileroVad.windowSize; + buffer.push(new Float32Array(data.buffer)); + while (buffer.size() > windowSize) { + const samples = buffer.get(buffer.head(), windowSize); + buffer.pop(windowSize); + vad.acceptWaveform(samples) + } + 
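+  // Each segment produced by the VAD is a complete speech chunk; decode it
+  // with the non-streaming Whisper recognizer below.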
+ while (!vad.isEmpty()) { + const segment = vad.front(); + vad.pop(); + const stream = recognizer.createStream(); + stream.acceptWaveform( + recognizer.config.featConfig.sampleRate, segment.samples); + recognizer.decode(stream); + const r = recognizer.getResult(stream); + stream.free(); + if (r.text.length > 0) { + console.log(`${index}: ${r.text}`); + index += 1; + } + } +}); + +ai.on('close', () => { + console.log('Free resources'); + recognizer.free(); + vad.free(); + buffer.free(); +}); + +ai.start(); +console.log('Started! Please speak') diff --git a/nodejs-examples/test-vad-microphone.js b/nodejs-examples/test-vad-microphone.js new file mode 100644 index 000000000..ec65b50fc --- /dev/null +++ b/nodejs-examples/test-vad-microphone.js @@ -0,0 +1,74 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) + +const sherpa_onnx = require('sherpa-onnx'); +const portAudio = require('naudiodon2'); +console.log(portAudio.getDevices()); + +function createVad() { + const sileroVadModelConfig = new sherpa_onnx.SileroVadModelConfig(); + sileroVadModelConfig.model = './silero_vad.onnx'; + sileroVadModelConfig.minSpeechDuration = 0.3; // seconds + sileroVadModelConfig.minSilenceDuration = 0.3; // seconds + sileroVadModelConfig.windowSize = 512; + + const vadModelConfig = new sherpa_onnx.VadModelConfig(); + vadModelConfig.sileroVad = sileroVadModelConfig; + vadModelConfig.sampleRate = 16000; + + const bufferSizeInSeconds = 60; + const vad = new sherpa_onnx.VoiceActivityDetector( + vadModelConfig, bufferSizeInSeconds); + return vad; +} +vad = createVad(); +const bufferSizeInSeconds = 30; +const buffer = + new sherpa_onnx.CircularBuffer(bufferSizeInSeconds * vad.config.sampleRate); + +const ai = new portAudio.AudioIO({ + inOptions: { + channelCount: 1, + closeOnError: true, // Close the stream if an audio error is detected, if + // set false then just log the error + deviceId: -1, // Use -1 or omit the deviceId to select the default device + sampleFormat: portAudio.SampleFormatFloat32, + sampleRate: vad.config.sampleRate + } +}); + +let printed = false; +let index = 0; +ai.on('data', data => { + const windowSize = vad.config.sileroVad.windowSize; + buffer.push(new Float32Array(data.buffer)); + while (buffer.size() > windowSize) { + const samples = buffer.get(buffer.head(), windowSize); + buffer.pop(windowSize); + vad.acceptWaveform(samples) + if (vad.isDetected() && !printed) { + console.log(`${index}: Detected speech`) + printed = true; + } + + if (!vad.isDetected()) { + printed = false; + } + + while (!vad.isEmpty()) { + const segment = vad.front(); + vad.pop(); + const duration = segment.samples.length / vad.config.sampleRate; + console.log(`${index} End of speech. Duration: ${duration} seconds`); + index += 1; + } + } +}); + +ai.on('close', () => { + console.log('Free resources'); + vad.free(); + buffer.free(); +}); + +ai.start(); +console.log('Started! 
Please speak') diff --git a/scripts/nodejs/.clang-format b/scripts/nodejs/.clang-format new file mode 100644 index 000000000..f62c72099 --- /dev/null +++ b/scripts/nodejs/.clang-format @@ -0,0 +1,3 @@ +Language: JavaScript +JavaScriptQuotes: Double + diff --git a/scripts/nodejs/.gitignore b/scripts/nodejs/.gitignore new file mode 100644 index 000000000..d84de665b --- /dev/null +++ b/scripts/nodejs/.gitignore @@ -0,0 +1,2 @@ +node_modules +jslint.mjs diff --git a/scripts/nodejs/README.md b/scripts/nodejs/README.md new file mode 100644 index 000000000..ed520597e --- /dev/null +++ b/scripts/nodejs/README.md @@ -0,0 +1,9 @@ +# Introduction + +Text-to-speech and speech-to-text with [Next-gen Kaldi](https://github.com/k2-fsa/). + +It processes everything locally without accessing the Internet. + +Please refer to +https://github.com/k2-fsa/sherpa-onnx/tree/master/nodejs-examples +for examples. diff --git a/scripts/nodejs/index.js b/scripts/nodejs/index.js new file mode 100644 index 000000000..ac77ae4a1 --- /dev/null +++ b/scripts/nodejs/index.js @@ -0,0 +1,717 @@ +// Copyright (c) 2023 Xiaomi Corporation (authors: Fangjun Kuang) +// +// Please use +// +// npm install ffi-napi ref-struct-napi +// +// before you use this file +// +// +// Please use node 13. node 16, 18, 20, and 21 are known not working. +// See also +// https://github.com/node-ffi-napi/node-ffi-napi/issues/244 +// and +// https://github.com/node-ffi-napi/node-ffi-napi/issues/97 +"use strict" + +const debug = require("debug")("sherpa-onnx"); +const os = require("os"); +const path = require("path"); +const ffi = require("ffi-napi"); +const ref = require("ref-napi"); +const fs = require("fs"); +var ArrayType = require("ref-array-napi"); + +const FloatArray = ArrayType(ref.types.float); +const StructType = require("ref-struct-napi"); +const cstring = ref.types.CString; +const cstringPtr = ref.refType(cstring); +const int32_t = ref.types.int32; +const float = ref.types.float; +const floatPtr = ref.refType(float); + +const SherpaOnnxOnlineTransducerModelConfig = StructType({ + "encoder" : cstring, + "decoder" : cstring, + "joiner" : cstring, +}); + +const SherpaOnnxOnlineParaformerModelConfig = StructType({ + "encoder" : cstring, + "decoder" : cstring, +}); + +const SherpaOnnxOnlineModelConfig = StructType({ + "transducer" : SherpaOnnxOnlineTransducerModelConfig, + "paraformer" : SherpaOnnxOnlineParaformerModelConfig, + "tokens" : cstring, + "numThreads" : int32_t, + "provider" : cstring, + "debug" : int32_t, + "modelType" : cstring, +}); + +const SherpaOnnxFeatureConfig = StructType({ + "sampleRate" : int32_t, + "featureDim" : int32_t, +}); + +const SherpaOnnxOnlineRecognizerConfig = StructType({ + "featConfig" : SherpaOnnxFeatureConfig, + "modelConfig" : SherpaOnnxOnlineModelConfig, + "decodingMethod" : cstring, + "maxActivePaths" : int32_t, + "enableEndpoint" : int32_t, + "rule1MinTrailingSilence" : float, + "rule2MinTrailingSilence" : float, + "rule3MinUtteranceLength" : float, + "hotwordsFile" : cstring, + "hotwordsScore" : float, +}); + +const SherpaOnnxOnlineRecognizerResult = StructType({ + "text" : cstring, + "tokens" : cstring, + "tokensArr" : cstringPtr, + "timestamps" : floatPtr, + "count" : int32_t, + "json" : cstring, +}); + +const SherpaOnnxOnlineRecognizerPtr = ref.refType(ref.types.void); +const SherpaOnnxOnlineStreamPtr = ref.refType(ref.types.void); +const SherpaOnnxOnlineStreamPtrPtr = ref.refType(SherpaOnnxOnlineStreamPtr); +const SherpaOnnxOnlineRecognizerResultPtr = + ref.refType(SherpaOnnxOnlineRecognizerResult); 
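+// Every StructType declared in this file mirrors the corresponding C struct in
+// sherpa-onnx/c-api/c-api.h. ref-struct-napi derives field offsets from the
+// declaration order, so the field names, order, and types here must stay in
+// sync with that header.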
+ +const SherpaOnnxOnlineRecognizerConfigPtr = + ref.refType(SherpaOnnxOnlineRecognizerConfig); + +const SherpaOnnxOfflineTransducerModelConfig = StructType({ + "encoder" : cstring, + "decoder" : cstring, + "joiner" : cstring, +}); + +const SherpaOnnxOfflineParaformerModelConfig = StructType({ + "model" : cstring, +}); + +const SherpaOnnxOfflineNemoEncDecCtcModelConfig = StructType({ + "model" : cstring, +}); + +const SherpaOnnxOfflineWhisperModelConfig = StructType({ + "encoder" : cstring, + "decoder" : cstring, +}); + +const SherpaOnnxOfflineTdnnModelConfig = StructType({ + "model" : cstring, +}); + +const SherpaOnnxOfflineLMConfig = StructType({ + "model" : cstring, + "scale" : float, +}); + +const SherpaOnnxOfflineModelConfig = StructType({ + "transducer" : SherpaOnnxOfflineTransducerModelConfig, + "paraformer" : SherpaOnnxOfflineParaformerModelConfig, + "nemoCtc" : SherpaOnnxOfflineNemoEncDecCtcModelConfig, + "whisper" : SherpaOnnxOfflineWhisperModelConfig, + "tdnn" : SherpaOnnxOfflineTdnnModelConfig, + "tokens" : cstring, + "numThreads" : int32_t, + "debug" : int32_t, + "provider" : cstring, + "modelType" : cstring, +}); + +const SherpaOnnxOfflineRecognizerConfig = StructType({ + "featConfig" : SherpaOnnxFeatureConfig, + "modelConfig" : SherpaOnnxOfflineModelConfig, + "lmConfig" : SherpaOnnxOfflineLMConfig, + "decodingMethod" : cstring, + "maxActivePaths" : int32_t, + "hotwordsFile" : cstring, + "hotwordsScore" : float, +}); + +const SherpaOnnxOfflineRecognizerResult = StructType({ + "text" : cstring, + "timestamps" : floatPtr, + "count" : int32_t, +}); + +const SherpaOnnxOfflineRecognizerPtr = ref.refType(ref.types.void); +const SherpaOnnxOfflineStreamPtr = ref.refType(ref.types.void); +const SherpaOnnxOfflineStreamPtrPtr = ref.refType(SherpaOnnxOfflineStreamPtr); +const SherpaOnnxOfflineRecognizerResultPtr = + ref.refType(SherpaOnnxOfflineRecognizerResult); + +const SherpaOnnxOfflineRecognizerConfigPtr = + ref.refType(SherpaOnnxOfflineRecognizerConfig); + +// vad +const SherpaOnnxSileroVadModelConfig = StructType({ + "model" : cstring, + "threshold" : float, + "minSilenceDuration" : float, + "minSpeechDuration" : float, + "windowSize" : int32_t, +}); + +const SherpaOnnxVadModelConfig = StructType({ + "sileroVad" : SherpaOnnxSileroVadModelConfig, + "sampleRate" : int32_t, + "numThreads" : int32_t, + "provider" : cstring, + "debug" : int32_t, +}); + +const SherpaOnnxSpeechSegment = StructType({ + "start" : int32_t, + "samples" : FloatArray, + "n" : int32_t, +}); + +const SherpaOnnxVadModelConfigPtr = ref.refType(SherpaOnnxVadModelConfig); +const SherpaOnnxSpeechSegmentPtr = ref.refType(SherpaOnnxSpeechSegment); +const SherpaOnnxCircularBufferPtr = ref.refType(ref.types.void); +const SherpaOnnxVoiceActivityDetectorPtr = ref.refType(ref.types.void); + +// tts +const SherpaOnnxOfflineTtsVitsModelConfig = StructType({ + "model" : cstring, + "lexicon" : cstring, + "tokens" : cstring, + "noiseScale" : float, + "noiseScaleW" : float, + "lengthScale" : float, +}); + +const SherpaOnnxOfflineTtsModelConfig = StructType({ + "vits" : SherpaOnnxOfflineTtsVitsModelConfig, + "numThreads" : int32_t, + "debug" : int32_t, + "provider" : cstring, +}); + +const SherpaOnnxOfflineTtsConfig = StructType({ + "model" : SherpaOnnxOfflineTtsModelConfig, + "ruleFsts" : cstring, +}); + +const SherpaOnnxGeneratedAudio = StructType({ + "samples" : FloatArray, + "n" : int32_t, + "sampleRate" : int32_t, +}); + +const SherpaOnnxOfflineTtsVitsModelConfigPtr = + ref.refType(SherpaOnnxOfflineTtsVitsModelConfig); +const 
SherpaOnnxOfflineTtsConfigPtr = ref.refType(SherpaOnnxOfflineTtsConfig); +const SherpaOnnxGeneratedAudioPtr = ref.refType(SherpaOnnxGeneratedAudio); +const SherpaOnnxOfflineTtsPtr = ref.refType(ref.types.void); + +const SherpaOnnxDisplayPtr = ref.refType(ref.types.void); + +let soname; +if (os.platform() == "win32") { + // see https://nodejs.org/api/process.html#processarch + if (process.arch == "x64") { + let currentPath = process.env.Path; + let dllDirectory = path.resolve(path.join(__dirname, "lib", "win-x64")); + process.env.Path = currentPath + path.delimiter + dllDirectory; + + soname = path.join(__dirname, "lib", "win-x64", "sherpa-onnx-c-api.dll") + } else if (process.arch == "ia32") { + let currentPath = process.env.Path; + let dllDirectory = path.resolve(path.join(__dirname, "lib", "win-x86")); + process.env.Path = currentPath + path.delimiter + dllDirectory; + + soname = path.join(__dirname, "lib", "win-x86", "sherpa-onnx-c-api.dll") + } else { + throw new Error( + `Support only Windows x86 and x64 for now. Given ${process.arch}`); + } +} else if (os.platform() == "darwin") { + if (process.arch == "x64") { + soname = + path.join(__dirname, "lib", "osx-x64", "libsherpa-onnx-c-api.dylib"); + } else if (process.arch == "arm64") { + soname = + path.join(__dirname, "lib", "osx-arm64", "libsherpa-onnx-c-api.dylib"); + } else { + throw new Error( + `Support only macOS x64 and arm64 for now. Given ${process.arch}`); + } +} else if (os.platform() == "linux") { + if (process.arch == "x64") { + soname = + path.join(__dirname, "lib", "linux-x64", "libsherpa-onnx-c-api.so"); + } else { + throw new Error(`Support only Linux x64 for now. Given ${process.arch}`); + } +} else { + throw new Error(`Unsupported platform ${os.platform()}`); +} + +if (!fs.existsSync(soname)) { + throw new Error(`Cannot find file ${soname}. 
Please make sure you have run + ./build.sh`); +} + +debug("soname ", soname) + +const libsherpa_onnx = ffi.Library(soname, { + // online asr + "CreateOnlineRecognizer" : [ + SherpaOnnxOnlineRecognizerPtr, [ SherpaOnnxOnlineRecognizerConfigPtr ] + ], + "DestroyOnlineRecognizer" : [ "void", [ SherpaOnnxOnlineRecognizerPtr ] ], + "CreateOnlineStream" : + [ SherpaOnnxOnlineStreamPtr, [ SherpaOnnxOnlineRecognizerPtr ] ], + "CreateOnlineStreamWithHotwords" : + [ SherpaOnnxOnlineStreamPtr, [ SherpaOnnxOnlineRecognizerPtr, cstring ] ], + "DestroyOnlineStream" : [ "void", [ SherpaOnnxOnlineStreamPtr ] ], + "AcceptWaveform" : + [ "void", [ SherpaOnnxOnlineStreamPtr, int32_t, floatPtr, int32_t ] ], + "IsOnlineStreamReady" : + [ int32_t, [ SherpaOnnxOnlineRecognizerPtr, SherpaOnnxOnlineStreamPtr ] ], + "DecodeOnlineStream" : + [ "void", [ SherpaOnnxOnlineRecognizerPtr, SherpaOnnxOnlineStreamPtr ] ], + "DecodeMultipleOnlineStreams" : [ + "void", + [ SherpaOnnxOnlineRecognizerPtr, SherpaOnnxOnlineStreamPtrPtr, int32_t ] + ], + "GetOnlineStreamResult" : [ + SherpaOnnxOnlineRecognizerResultPtr, + [ SherpaOnnxOnlineRecognizerPtr, SherpaOnnxOnlineStreamPtr ] + ], + "DestroyOnlineRecognizerResult" : + [ "void", [ SherpaOnnxOnlineRecognizerResultPtr ] ], + "Reset" : + [ "void", [ SherpaOnnxOnlineRecognizerPtr, SherpaOnnxOnlineStreamPtr ] ], + "InputFinished" : [ "void", [ SherpaOnnxOnlineStreamPtr ] ], + "IsEndpoint" : + [ int32_t, [ SherpaOnnxOnlineRecognizerPtr, SherpaOnnxOnlineStreamPtr ] ], + + // offline asr + "CreateOfflineRecognizer" : [ + SherpaOnnxOfflineRecognizerPtr, [ SherpaOnnxOfflineRecognizerConfigPtr ] + ], + "DestroyOfflineRecognizer" : [ "void", [ SherpaOnnxOfflineRecognizerPtr ] ], + "CreateOfflineStream" : + [ SherpaOnnxOfflineStreamPtr, [ SherpaOnnxOfflineRecognizerPtr ] ], + "DestroyOfflineStream" : [ "void", [ SherpaOnnxOfflineStreamPtr ] ], + "AcceptWaveformOffline" : + [ "void", [ SherpaOnnxOfflineStreamPtr, int32_t, floatPtr, int32_t ] ], + "DecodeOfflineStream" : [ + "void", [ SherpaOnnxOfflineRecognizerPtr, SherpaOnnxOfflineStreamPtr ] + ], + "DecodeMultipleOfflineStreams" : [ + "void", + [ SherpaOnnxOfflineRecognizerPtr, SherpaOnnxOfflineStreamPtrPtr, int32_t ] + ], + "GetOfflineStreamResult" : + [ SherpaOnnxOfflineRecognizerResultPtr, [ SherpaOnnxOfflineStreamPtr ] ], + "DestroyOfflineRecognizerResult" : + [ "void", [ SherpaOnnxOfflineRecognizerResultPtr ] ], + + // vad + "SherpaOnnxCreateCircularBuffer" : + [ SherpaOnnxCircularBufferPtr, [ int32_t ] ], + "SherpaOnnxDestroyCircularBuffer" : + [ "void", [ SherpaOnnxCircularBufferPtr ] ], + "SherpaOnnxCircularBufferPush" : + [ "void", [ SherpaOnnxCircularBufferPtr, floatPtr, int32_t ] ], + "SherpaOnnxCircularBufferGet" : + [ FloatArray, [ SherpaOnnxCircularBufferPtr, int32_t, int32_t ] ], + "SherpaOnnxCircularBufferFree" : [ "void", [ FloatArray ] ], + "SherpaOnnxCircularBufferPop" : + [ "void", [ SherpaOnnxCircularBufferPtr, int32_t ] ], + "SherpaOnnxCircularBufferSize" : [ int32_t, [ SherpaOnnxCircularBufferPtr ] ], + "SherpaOnnxCircularBufferHead" : [ int32_t, [ SherpaOnnxCircularBufferPtr ] ], + "SherpaOnnxCircularBufferReset" : [ "void", [ SherpaOnnxCircularBufferPtr ] ], + "SherpaOnnxCreateVoiceActivityDetector" : [ + SherpaOnnxVoiceActivityDetectorPtr, [ SherpaOnnxVadModelConfigPtr, float ] + ], + "SherpaOnnxDestroyVoiceActivityDetector" : + [ "void", [ SherpaOnnxVoiceActivityDetectorPtr ] ], + "SherpaOnnxVoiceActivityDetectorAcceptWaveform" : + [ "void", [ SherpaOnnxVoiceActivityDetectorPtr, floatPtr, int32_t ] ], + 
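+  // Each entry maps an exported C symbol to [returnType, [argumentTypes]];
+  // ffi-napi uses this signature to marshal JavaScript values when the
+  // function is invoked.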
"SherpaOnnxVoiceActivityDetectorEmpty" : + [ int32_t, [ SherpaOnnxVoiceActivityDetectorPtr ] ], + "SherpaOnnxVoiceActivityDetectorDetected" : + [ int32_t, [ SherpaOnnxVoiceActivityDetectorPtr ] ], + "SherpaOnnxVoiceActivityDetectorPop" : + [ "void", [ SherpaOnnxVoiceActivityDetectorPtr ] ], + "SherpaOnnxVoiceActivityDetectorClear" : + [ "void", [ SherpaOnnxVoiceActivityDetectorPtr ] ], + "SherpaOnnxVoiceActivityDetectorFront" : + [ SherpaOnnxSpeechSegmentPtr, [ SherpaOnnxVoiceActivityDetectorPtr ] ], + "SherpaOnnxDestroySpeechSegment" : [ "void", [ SherpaOnnxSpeechSegmentPtr ] ], + "SherpaOnnxVoiceActivityDetectorReset" : + [ "void", [ SherpaOnnxVoiceActivityDetectorPtr ] ], + // tts + "SherpaOnnxCreateOfflineTts" : + [ SherpaOnnxOfflineTtsPtr, [ SherpaOnnxOfflineTtsConfigPtr ] ], + "SherpaOnnxDestroyOfflineTts" : [ "void", [ SherpaOnnxOfflineTtsPtr ] ], + "SherpaOnnxOfflineTtsGenerate" : [ + SherpaOnnxGeneratedAudioPtr, + [ SherpaOnnxOfflineTtsPtr, cstring, int32_t, float ] + ], + "SherpaOnnxDestroyOfflineTtsGeneratedAudio" : + [ "void", [ SherpaOnnxGeneratedAudioPtr ] ], + "SherpaOnnxWriteWave" : [ "void", [ floatPtr, int32_t, int32_t, cstring ] ], + + // display + "CreateDisplay" : [ SherpaOnnxDisplayPtr, [ int32_t ] ], + "DestroyDisplay" : [ "void", [ SherpaOnnxDisplayPtr ] ], + "SherpaOnnxPrint" : [ "void", [ SherpaOnnxDisplayPtr, int32_t, cstring ] ], +}); + +class Display { + constructor(maxWordPerLine) { + this.handle = libsherpa_onnx.CreateDisplay(maxWordPerLine); + } + free() { + if (this.handle) { + libsherpa_onnx.DestroyDisplay(this.handle); + this.handle = null; + } + } + + print(idx, s) { libsherpa_onnx.SherpaOnnxPrint(this.handle, idx, s); } +}; + +class OnlineResult { + constructor(text) { this.text = Buffer.from(text, "utf-8").toString(); } +}; + +class OnlineStream { + constructor(handle) { this.handle = handle } + + free() { + if (this.handle) { + libsherpa_onnx.DestroyOnlineStream(this.handle); + this.handle = null; + } + } + + /** + * @param sampleRate {Number} + * @param samples {Float32Array} Containing samples in the range [-1, 1] + */ + acceptWaveform(sampleRate, samples) { + libsherpa_onnx.AcceptWaveform(this.handle, sampleRate, samples, + samples.length); + } +}; + +class OnlineRecognizer { + constructor(config) { + this.config = config; + this.recognizer_handle = + libsherpa_onnx.CreateOnlineRecognizer(config.ref()); + } + + free() { + if (this.recognizer_handle) { + libsherpa_onnx.DestroyOnlineRecognizer(this.recognizer_handle); + this.recognizer_handle = null; + } + } + + createStream() { + let handle = libsherpa_onnx.CreateOnlineStream(this.recognizer_handle); + return new OnlineStream(handle); + } + + isReady(stream) { + return libsherpa_onnx.IsOnlineStreamReady(this.recognizer_handle, + stream.handle) + } + + isEndpoint(stream) { + return libsherpa_onnx.IsEndpoint(this.recognizer_handle, stream.handle); + } + + reset(stream) { libsherpa_onnx.Reset(this.recognizer_handle, stream.handle); } + + decode(stream) { + libsherpa_onnx.DecodeOnlineStream(this.recognizer_handle, stream.handle) + } + + getResult(stream) { + let handle = libsherpa_onnx.GetOnlineStreamResult(this.recognizer_handle, + stream.handle); + let r = handle.deref(); + let ans = new OnlineResult(r.text); + libsherpa_onnx.DestroyOnlineRecognizerResult(handle); + + return ans + } +}; + +class OfflineResult { + constructor(text) { this.text = Buffer.from(text, "utf-8").toString(); } +}; + +class OfflineStream { + constructor(handle) { this.handle = handle } + + free() { + if (this.handle) { + 
libsherpa_onnx.DestroyOfflineStream(this.handle); + this.handle = null; + } + } + + /** + * @param sampleRate {Number} + * @param samples {Float32Array} Containing samples in the range [-1, 1] + */ + acceptWaveform(sampleRate, samples) { + libsherpa_onnx.AcceptWaveformOffline(this.handle, sampleRate, samples, + samples.length); + } +}; + +class OfflineRecognizer { + constructor(config) { + this.config = config; + this.recognizer_handle = + libsherpa_onnx.CreateOfflineRecognizer(config.ref()); + } + + free() { + if (this.recognizer_handle) { + libsherpa_onnx.DestroyOfflineRecognizer(this.recognizer_handle); + this.recognizer_handle = null; + } + } + + createStream() { + let handle = libsherpa_onnx.CreateOfflineStream(this.recognizer_handle); + return new OfflineStream(handle); + } + + decode(stream) { + libsherpa_onnx.DecodeOfflineStream(this.recognizer_handle, stream.handle) + } + + getResult(stream) { + let handle = libsherpa_onnx.GetOfflineStreamResult(stream.handle); + let r = handle.deref(); + let ans = new OfflineResult(r.text); + libsherpa_onnx.DestroyOfflineRecognizerResult(handle); + + return ans + } +}; + +class SpeechSegment { + constructor(start, samples) { + this.start = start; + this.samples = samples; + } +}; + +// this buffer holds only float entries. +class CircularBuffer { + /** + * @param capacity {int} The capacity of the circular buffer. + */ + constructor(capacity) { + this.handle = libsherpa_onnx.SherpaOnnxCreateCircularBuffer(capacity); + } + + free() { + if (this.handle) { + libsherpa_onnx.SherpaOnnxDestroyCircularBuffer(this.handle); + this.handle = null; + } + } + + /** + * @param samples {Float32Array} + */ + push(samples) { + libsherpa_onnx.SherpaOnnxCircularBufferPush(this.handle, samples, + samples.length); + } + + get(startIndex, n) { + let data = + libsherpa_onnx.SherpaOnnxCircularBufferGet(this.handle, startIndex, n); + + // https://tootallnate.github.io/ref/#exports-reinterpret + const buffer = data.buffer.reinterpret(n * ref.sizeof.float).buffer; + + // create a copy since we are going to free the buffer at the end + let s = new Float32Array(buffer).slice(0); + libsherpa_onnx.SherpaOnnxCircularBufferFree(data); + return s; + } + + pop(n) { libsherpa_onnx.SherpaOnnxCircularBufferPop(this.handle, n); } + + size() { return libsherpa_onnx.SherpaOnnxCircularBufferSize(this.handle); } + + head() { return libsherpa_onnx.SherpaOnnxCircularBufferHead(this.handle); } + + reset() { libsherpa_onnx.SherpaOnnxCircularBufferReset(this.handle); } +}; + +class VoiceActivityDetector { + constructor(config, bufferSizeInSeconds) { + this.config = config; + this.handle = libsherpa_onnx.SherpaOnnxCreateVoiceActivityDetector( + config.ref(), bufferSizeInSeconds); + } + + free() { + if (this.handle) { + libsherpa_onnx.SherpaOnnxDestroyVoiceActivityDetector(this.handle); + } + } + + acceptWaveform(samples) { + libsherpa_onnx.SherpaOnnxVoiceActivityDetectorAcceptWaveform( + this.handle, samples, samples.length); + } + + isEmpty() { + return libsherpa_onnx.SherpaOnnxVoiceActivityDetectorEmpty(this.handle); + } + + isDetected() { + return libsherpa_onnx.SherpaOnnxVoiceActivityDetectorDetected(this.handle); + } + pop() { libsherpa_onnx.SherpaOnnxVoiceActivityDetectorPop(this.handle); } + + clear() { libsherpa_onnx.SherpaOnnxVoiceActivityDetectorClear(this.handle); } + + reset() { libsherpa_onnx.SherpaOnnxVoiceActivityDetectorReset(this.handle); } + + front() { + let segment = + libsherpa_onnx.SherpaOnnxVoiceActivityDetectorFront(this.handle); + + let buffer = + segment.deref() + 
.samples.buffer.reinterpret(segment.deref().n * ref.sizeof.float) + .buffer; + + let samples = new Float32Array(buffer).slice(0); + let ans = new SpeechSegment(segment.deref().start, samples); + + libsherpa_onnx.SherpaOnnxDestroySpeechSegment(segment); + return ans; + } +}; + +class GeneratedAudio { + constructor(sampleRate, samples) { + this.sampleRate = sampleRate; + this.samples = samples; + } + save(filename) { + libsherpa_onnx.SherpaOnnxWriteWave(this.samples, this.samples.length, + this.sampleRate, filename); + } +}; + +class OfflineTts { + constructor(config) { + this.config = config; + this.handle = libsherpa_onnx.SherpaOnnxCreateOfflineTts(config.ref()); + } + + free() { + if (this.handle) { + libsherpa_onnx.SherpaOnnxDestroyOfflineTts(this.handle); + this.handle = null; + } + } + generate(text, sid, speed) { + let r = libsherpa_onnx.SherpaOnnxOfflineTtsGenerate(this.handle, text, sid, + speed); + const buffer = + r.deref() + .samples.buffer.reinterpret(r.deref().n * ref.sizeof.float) + .buffer; + let samples = new Float32Array(buffer).slice(0); + let sampleRate = r.deref().sampleRate; + + let generatedAudio = new GeneratedAudio(sampleRate, samples); + + libsherpa_onnx.SherpaOnnxDestroyOfflineTtsGeneratedAudio(r); + + return generatedAudio; + } +}; + +// online asr +const OnlineTransducerModelConfig = SherpaOnnxOnlineTransducerModelConfig; +const OnlineModelConfig = SherpaOnnxOnlineModelConfig; +const FeatureConfig = SherpaOnnxFeatureConfig; +const OnlineRecognizerConfig = SherpaOnnxOnlineRecognizerConfig; +const OnlineParaformerModelConfig = SherpaOnnxOnlineParaformerModelConfig; + +// offline asr +const OfflineTransducerModelConfig = SherpaOnnxOfflineTransducerModelConfig; +const OfflineModelConfig = SherpaOnnxOfflineModelConfig; +const OfflineRecognizerConfig = SherpaOnnxOfflineRecognizerConfig; +const OfflineParaformerModelConfig = SherpaOnnxOfflineParaformerModelConfig; +const OfflineWhisperModelConfig = SherpaOnnxOfflineWhisperModelConfig; +const OfflineNemoEncDecCtcModelConfig = + SherpaOnnxOfflineNemoEncDecCtcModelConfig; +const OfflineTdnnModelConfig = SherpaOnnxOfflineTdnnModelConfig; + +// vad +const SileroVadModelConfig = SherpaOnnxSileroVadModelConfig; +const VadModelConfig = SherpaOnnxVadModelConfig; + +// tts +const OfflineTtsVitsModelConfig = SherpaOnnxOfflineTtsVitsModelConfig; +const OfflineTtsModelConfig = SherpaOnnxOfflineTtsModelConfig; +const OfflineTtsConfig = SherpaOnnxOfflineTtsConfig; + +module.exports = { + // online asr + OnlineTransducerModelConfig, + OnlineModelConfig, + FeatureConfig, + OnlineRecognizerConfig, + OnlineRecognizer, + OnlineStream, + OnlineParaformerModelConfig, + + // offline asr + OfflineRecognizer, + OfflineStream, + OfflineTransducerModelConfig, + OfflineModelConfig, + OfflineRecognizerConfig, + OfflineParaformerModelConfig, + OfflineWhisperModelConfig, + OfflineNemoEncDecCtcModelConfig, + OfflineTdnnModelConfig, + // vad + SileroVadModelConfig, + VadModelConfig, + CircularBuffer, + VoiceActivityDetector, + // tts + OfflineTtsVitsModelConfig, + OfflineTtsModelConfig, + OfflineTtsConfig, + OfflineTts, + + // + Display, +}; diff --git a/scripts/nodejs/package.json b/scripts/nodejs/package.json new file mode 100644 index 000000000..bfe671ff0 --- /dev/null +++ b/scripts/nodejs/package.json @@ -0,0 +1,37 @@ +{ + "name": "sherpa-onnx2", + "version": "1.8.10", + "description": "Real-time speech recognition with Next-gen Kaldi", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "repository": 
{ + "type": "git", + "url": "git+https://github.com/k2-fsa/sherpa-onnx.git" + }, + "keywords": [ + "speech-to-text", + "text-to-speech", + "real-time speech recognition", + "without internet connect", + "embedded systems", + "open source", + "zipformer", + "asr", + "speech" + ], + "author": "The next-gen Kaldi team", + "license": "Apache-2.0", + "bugs": { + "url": "https://github.com/k2-fsa/sherpa-onnx/issues" + }, + "homepage": "https://github.com/k2-fsa/sherpa-onnx#readme", + "dependencies": { + "ffi-napi": "^4.0.3", + "npm": "^6.14.18", + "ref-array-napi": "^1.2.2", + "ref-napi": "^3.0.3", + "ref-struct-napi": "^1.1.1" + } +} diff --git a/scripts/nodejs/package.json.in b/scripts/nodejs/package.json.in new file mode 100644 index 000000000..b097edc90 --- /dev/null +++ b/scripts/nodejs/package.json.in @@ -0,0 +1,50 @@ +{ + "name": "sherpa-onnx", + "version": "SHERPA_ONNX_VERSION", + "description": "Real-time speech recognition with Next-gen Kaldi", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/k2-fsa/sherpa-onnx.git" + }, + "keywords": [ + "speech to text", + "text to speech", + "transcription", + "real-time speech recognition", + "without internet connect", + "embedded systems", + "open source", + "zipformer", + "asr", + "tts", + "stt", + "c++", + "onnxruntime", + "onnx", + "ai", + "next-gen kaldi", + "offline", + "privacy", + "open source", + "streaming speech recognition", + "speech", + "recognition" + ], + "author": "The next-gen Kaldi team", + "license": "Apache-2.0", + "bugs": { + "url": "https://github.com/k2-fsa/sherpa-onnx/issues" + }, + "homepage": "https://github.com/k2-fsa/sherpa-onnx#readme", + "dependencies": { + "ffi-napi": "^4.0.3", + "npm": "^6.14.18", + "ref-array-napi": "^1.2.2", + "ref-napi": "^3.0.3", + "ref-struct-napi": "^1.1.1" + } +} diff --git a/scripts/nodejs/run.sh b/scripts/nodejs/run.sh new file mode 100755 index 000000000..213a87ecf --- /dev/null +++ b/scripts/nodejs/run.sh @@ -0,0 +1,123 @@ +#!/usr/bin/env bash +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(realpath $SCRIPT_DIR/../..) +echo "SCRIPT_DIR: $SCRIPT_DIR" +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +SHERPA_ONNX_VERSION=$(grep "SHERPA_ONNX_VERSION" $SHERPA_ONNX_DIR/CMakeLists.txt | cut -d " " -f 2 | cut -d '"' -f 2) + +echo "SHERPA_ONNX_VERSION $SHERPA_ONNX_VERSION" +sed -i.bak s/SHERPA_ONNX_VERSION/$SHERPA_ONNX_VERSION/g ./package.json.in + +cp package.json.in package.json +rm package.json.in +rm package.json.in.bak +rm .clang-format + +function windows_x64() { + echo "Process Windows (x64)" + mkdir -p lib/windows-x64 + dst=$(realpath lib/windows-x64) + mkdir t + cd t + wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl + unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win_amd64.whl + + cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.dll $dst + cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.lib $dst + rm -fv $dst/sherpa-onnx-portaudio.dll + + cd .. 
+ rm -rf t +} + +function windows_x86() { + echo "Process Windows (x86)" + mkdir -p lib/windows-x86 + dst=$(realpath lib/windows-x86) + mkdir t + cd t + wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win32.whl + unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-win32.whl + + cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.dll $dst + cp -v sherpa_onnx-${SHERPA_ONNX_VERSION}.data/data/bin/*.lib $dst + rm -fv $dst/sherpa-onnx-portaudio.dll + + cd .. + rm -rf t +} + +function linux_x64() { + echo "Process Linux (x64)" + mkdir -p lib/linux-x64 + dst=$(realpath lib/linux-x64) + mkdir t + cd t + wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + + cp -v sherpa_onnx/lib/*.so* $dst + rm -v $dst/libcargs.so + rm -v $dst/libsherpa-onnx-portaudio.so + rm -v $dst/libsherpa-onnx-fst.so + rm -v $dst/libonnxruntime.so + + cd .. + rm -rf t +} + +function osx_x64() { + echo "Process osx-x64" + mkdir -p lib/osx-x64 + dst=$(realpath lib/osx-x64) + mkdir t + cd t + wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_10_14_x86_64.whl + unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_10_14_x86_64.whl + + cp -v sherpa_onnx/lib/*.dylib $dst/ + rm -v $dst/libonnxruntime.dylib + rm -v $dst/libcargs.dylib + rm -v $dst/libsherpa-onnx-fst.dylib + rm -v $dst/libsherpa-onnx-portaudio.dylib + + cd .. + rm -rf t +} + +function osx_arm64() { + echo "Process osx-arm64" + mkdir -p lib/osx-arm64 + dst=$(realpath lib/osx-arm64) + mkdir t + cd t + wget -q https://huggingface.co/csukuangfj/sherpa-onnx-wheels/resolve/main/sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_arm64.whl + unzip ./sherpa_onnx-${SHERPA_ONNX_VERSION}-cp38-cp38-macosx_11_0_arm64.whl + + cp -v sherpa_onnx/lib/*.dylib $dst/ + rm -v $dst/libonnxruntime.dylib + rm -v $dst/libcargs.dylib + rm -v $dst/libsherpa-onnx-fst.dylib + rm -v $dst/libsherpa-onnx-portaudio.dylib + + cd .. 
+ rm -rf t +} + +windows_x64 +ls -lh lib/windows-x64 + +windows_x86 +ls -lh lib/windows-x86 + +linux_x64 +ls -lh lib/linux-x64 + +osx_x64 +ls -lh lib/osx-x64 + +osx_arm64 +ls -lh lib/osx-arm64 diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 6a7a1dd61..a88063def 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -438,6 +438,10 @@ int32_t SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer) { return buffer->impl->Size(); } +int32_t SherpaOnnxCircularBufferHead(SherpaOnnxCircularBuffer *buffer) { + return buffer->impl->Head(); +} + void SherpaOnnxCircularBufferReset(SherpaOnnxCircularBuffer *buffer) { buffer->impl->Reset(); } @@ -553,6 +557,7 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( tts_config.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1); tts_config.model.debug = config->model.debug; tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu"); + tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); if (tts_config.model.debug) { fprintf(stderr, "%s\n", tts_config.ToString().c_str()); diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 19def531d..9fd48afde 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -130,10 +130,10 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineRecognizerResult { const char *text; // Pointer to continuous memory which holds string based tokens - // which are seperated by \0 + // which are separated by \0 const char *tokens; - // a pointer array contains the address of the first item in tokens + // a pointer array containing the address of the first item in tokens const char *const *tokens_arr; // Pointer to continuous memory which holds timestamps @@ -532,6 +532,11 @@ SHERPA_ONNX_API void SherpaOnnxCircularBufferPop( SHERPA_ONNX_API int32_t SherpaOnnxCircularBufferSize(SherpaOnnxCircularBuffer *buffer); +// Return the head of the buffer. It's always non-decreasing until you +// invoke SherpaOnnxCircularBufferReset() which resets head to 0. +SHERPA_ONNX_API int32_t +SherpaOnnxCircularBufferHead(SherpaOnnxCircularBuffer *buffer); + // Clear all elements in the buffer SHERPA_ONNX_API void SherpaOnnxCircularBufferReset( SherpaOnnxCircularBuffer *buffer); @@ -617,6 +622,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { SherpaOnnxOfflineTtsModelConfig model; + const char *rule_fsts; } SherpaOnnxOfflineTtsConfig; SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { diff --git a/sherpa-onnx/csrc/online-recognizer-paraformer-impl.h b/sherpa-onnx/csrc/online-recognizer-paraformer-impl.h index fbbeb2ada..a084e0da6 100644 --- a/sherpa-onnx/csrc/online-recognizer-paraformer-impl.h +++ b/sherpa-onnx/csrc/online-recognizer-paraformer-impl.h @@ -457,7 +457,7 @@ class OnlineRecognizerParaformerImpl : public OnlineRecognizerImpl { // (61 - 7) / 6 + 1 = 10 int32_t left_chunk_size_ = 5; - int32_t right_chunk_size_ = 5; + int32_t right_chunk_size_ = 2; }; } // namespace sherpa_onnx
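As a usage reference for the ruleFsts field that this change threads through SherpaOnnxOfflineTtsConfig, the sketch below shows how it would be set from the Node.js wrapper. It is a minimal sketch, not part of the change itself: the vits-vctk model paths and the rule.fst file are placeholder assumptions, and the field names follow the structs defined in scripts/nodejs/index.js.

const sherpa_onnx = require("sherpa-onnx");

const vitsConfig = new sherpa_onnx.OfflineTtsVitsModelConfig();
vitsConfig.model = "./vits-vctk/vits-vctk.onnx";   // placeholder path
vitsConfig.lexicon = "./vits-vctk/lexicon.txt";    // placeholder path
vitsConfig.tokens = "./vits-vctk/tokens.txt";      // placeholder path
vitsConfig.noiseScale = 0.667;
vitsConfig.noiseScaleW = 0.8;
vitsConfig.lengthScale = 1.0;

const modelConfig = new sherpa_onnx.OfflineTtsModelConfig();
modelConfig.vits = vitsConfig;
modelConfig.numThreads = 1;

const ttsConfig = new sherpa_onnx.OfflineTtsConfig();
ttsConfig.model = modelConfig;
// The new rule_fsts plumbing in c-api.cc falls back to "" when this is unset;
// point it at text-normalization FSTs if the model ships any (path assumed).
ttsConfig.ruleFsts = "./vits-vctk/rule.fst";

const tts = new sherpa_onnx.OfflineTts(ttsConfig);
const audio = tts.generate("Hello from sherpa-onnx", 0 /* speaker id */, 1.0 /* speed */);
audio.save("./generated.wav");
tts.free();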