diff --git a/silero-vad.ipynb b/silero-vad.ipynb index 8a84e81..9fb5d8a 100644 --- a/silero-vad.ipynb +++ b/silero-vad.ipynb @@ -1,662 +1,662 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "silero-vad.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "sVNOuHQQjsrp" - }, - "source": [ - "# PyTorch Examples" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FpMplOCA2Fwp" - }, - "source": [ - "## VAD" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "id": "62A6F_072Fwq" - }, - "source": [ - "### Install Dependencies" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "5w5AkskZ2Fwr" - }, - "source": [ - "#@title Install and Import Dependencies\n", - "\n", - "# this assumes that you have a relevant version of PyTorch installed\n", - "!pip install -q torchaudio soundfile\n", - "\n", - "SAMPLE_RATE = 16000\n", - "\n", - "import glob\n", - "import torch\n", - "torch.set_num_threads(1)\n", - "\n", - "from IPython.display import Audio\n", - "from pprint import pprint\n", - "\n", - "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", - " model='silero_vad',\n", - " force_reload=True)\n", - "\n", - "(get_speech_timestamps,\n", - " save_audio,\n", - " read_audio,\n", - " VADIterator,\n", - " collect_chunks) = utils\n", - "\n", - "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fXbbaUO3jsrw" - }, - "source": [ - "### Full Audio" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RAfJPb_a-Auj" - }, - "source": [ - "**Speech timestapms from full audio**" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "aI_eydBPjsrx" - }, - "source": [ - "wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", - "# get speech timestamps from full audio file\n", - "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLE_RATE)\n", - "pprint(speech_timestamps)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "OuEobLchjsry" - }, - "source": [ - "# merge all speech chunks to one audio\n", - "save_audio('only_speech.wav',\n", - " collect_chunks(speech_timestamps, wav), sampling_rate=16000) \n", - "Audio('only_speech.wav')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iDKQbVr8jsry" - }, - "source": [ - "**Stream imitation example**" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "q-lql_2Wjsry" - }, - "source": [ - "## using VADIterator class\n", - "\n", - "vad_iterator = VADiterator(double_model)\n", - "wav = read_audio((f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", - "\n", - 
"window_size_samples = 1536 # number of samples in a single audio chunk\n", - "for i in range(0, len(wav), window_size_samples):\n", - " speech_dict = vad_iterator(wav[i: i+ window_size_samples], return_seconds=True)\n", - " if speech_dict:\n", - " print(speech_dict, end=' ')\n", - "vad_iterator.reset_states() # reset model states after each audio" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "BX3UgwwB2Fwv" - }, - "source": [ - "## just probabilities\n", - "\n", - "wav = read_audio((f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", - "speech_probs = []\n", - "window_size_samples = 1536\n", - "for i in range(0, len(wav), window_size_samples):\n", - " speech_prob = model(wav[i: i+ window_size_samples], SAMPLE_RATE).item()\n", - " speech_probs.append(speech_prob)\n", - "\n", - "pprint(speech_probs[:100])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "id": "36jY0niD2Fww" - }, - "source": [ - "## Number detector" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "scd1DlS42Fwx" - }, - "source": [ - "### Install Dependencies" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "Kq5gQuYq2Fwx" - }, - "source": [ - "#@title Install and Import Dependencies\n", - "\n", - "# this assumes that you have a relevant version of PyTorch installed\n", - "!pip install -q torchaudio soundfile\n", - "\n", - "import glob\n", - "import torch\n", - "torch.set_num_threads(1)\n", - "\n", - "from IPython.display import Audio\n", - "from pprint import pprint\n", - "\n", - "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", - " model='silero_number_detector',\n", - " force_reload=True)\n", - "\n", - "(get_number_ts,\n", - " save_audio,\n", - " read_audio,\n", - " collect_chunks,\n", - " drop_chunks) = utils\n", - "\n", - "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "qhPa30ij2Fwy" - }, - "source": [ - "### Full audio" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "EXpau6xq2Fwy" - }, - "source": [ - "wav = read_audio(f'{files_dir}/en_num.wav')\n", - "# get number timestamps from full audio file\n", - "number_timestamps = get_number_ts(wav, model)\n", - "pprint(number_timestamps)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "u-KfXRhZ2Fwy" - }, - "source": [ - "sample_rate = 16000\n", - "# convert ms in timestamps to samples\n", - "for timestamp in number_timestamps:\n", - " timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n", - " timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "iwYEC4aZ2Fwy" - }, - "source": [ - "# merge all number chunks to one audio\n", - "save_audio('only_numbers.wav',\n", - " collect_chunks(number_timestamps, wav), sample_rate) \n", - "Audio('only_numbers.wav')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "fHaYejX12Fwy" - }, - "source": [ - "# drop all number chunks from audio\n", - "save_audio('no_numbers.wav',\n", - " 
drop_chunks(number_timestamps, wav), sample_rate) \n", - "Audio('no_numbers.wav')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "id": "PnKtJKbq2Fwz" - }, - "source": [ - "## Language detector" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "F5cAmMbP2Fwz" - }, - "source": [ - "### Install Dependencies" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "Zu9D0t6n2Fwz" - }, - "source": [ - "#@title Install and Import Dependencies\n", - "\n", - "# this assumes that you have a relevant version of PyTorch installed\n", - "!pip install -q torchaudio soundfile\n", - "\n", - "import glob\n", - "import torch\n", - "torch.set_num_threads(1)\n", - "\n", - "from IPython.display import Audio\n", - "from pprint import pprint\n", - "\n", - "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", - " model='silero_lang_detector',\n", - " force_reload=True)\n", - "\n", - "(get_language,\n", - " read_audio) = utils\n", - "\n", - "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "iC696eMX2Fwz" - }, - "source": [ - "### Full audio" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "c8UYnYBF2Fw0" - }, - "source": [ - "wav = read_audio(f'{files_dir}/en.wav')\n", - "lang = get_language(wav, model)\n", - "print(lang)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "57avIBd6jsrz" - }, - "source": [ - "# ONNX Example" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hEhnfORV2Fw0" - }, - "source": [ - "## VAD" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Cy7y-NAyALSe" - }, - "source": [ - "**TO BE DONE**" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "id": "7QMvUvpg2Fw4" - }, - "source": [ - "## Number detector" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "tBPDkpHr2Fw4" - }, - "source": [ - "### Install Dependencies" - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "hidden": true, - "id": "PdjGd56R2Fw5" - }, - "source": [ - "#@title Install and Import Dependencies\n", - "\n", - "# this assumes that you have a relevant version of PyTorch installed\n", - "!pip install -q torchaudio soundfile onnxruntime\n", - "\n", - "import glob\n", - "import torch\n", - "import onnxruntime\n", - "from pprint import pprint\n", - "\n", - "from IPython.display import Audio\n", - "\n", - "_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", - " model='silero_number_detector',\n", - " force_reload=True)\n", - "\n", - "(get_number_ts,\n", - " save_audio,\n", - " read_audio,\n", - " collect_chunks,\n", - " drop_chunks) = utils\n", - "\n", - "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n", - "\n", - "def init_onnx_model(model_path: str):\n", - " return onnxruntime.InferenceSession(model_path)\n", - "\n", - "def validate_onnx(model, inputs):\n", - " with torch.no_grad():\n", - " ort_inputs = {'input': inputs.cpu().numpy()}\n", - " outs = model.run(None, ort_inputs)\n", - " outs = [torch.Tensor(x) for x in outs]\n", - " return outs" - ], - "execution_count": null, - "outputs": [] - }, - { - 
"cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "I9QWSFZh2Fw5" - }, - "source": [ - "### Full Audio" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "_r6QZiwu2Fw5" - }, - "source": [ - "model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n", - "wav = read_audio(f'{files_dir}/en_num.wav')\n", - "\n", - "# get number timestamps from full audio file\n", - "number_timestamps = get_number_ts(wav, model, run_function=validate_onnx)\n", - "pprint(number_timestamps)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "FN4aDwLV2Fw5" - }, - "source": [ - "sample_rate = 16000\n", - "# convert ms in timestamps to samples\n", - "for timestamp in number_timestamps:\n", - " timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n", - " timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "JnvS6WTK2Fw5" - }, - "source": [ - "# merge all number chunks to one audio\n", - "save_audio('only_numbers.wav',\n", - " collect_chunks(number_timestamps, wav), 16000) \n", - "Audio('only_numbers.wav')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "yUxOcOFG2Fw6" - }, - "source": [ - "# drop all number chunks from audio\n", - "save_audio('no_numbers.wav',\n", - " drop_chunks(number_timestamps, wav), 16000) \n", - "Audio('no_numbers.wav')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "id": "SR8Bgcd52Fw6" - }, - "source": [ - "## Language detector" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "heading_collapsed": true, - "hidden": true, - "id": "PBnXPtKo2Fw6" - }, - "source": [ - "### Install Dependencies" - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "hidden": true, - "id": "iNkDWJ3H2Fw6" - }, - "source": [ - "#@title Install and Import Dependencies\n", - "\n", - "# this assumes that you have a relevant version of PyTorch installed\n", - "!pip install -q torchaudio soundfile onnxruntime\n", - "\n", - "import glob\n", - "import torch\n", - "import onnxruntime\n", - "from pprint import pprint\n", - "\n", - "from IPython.display import Audio\n", - "\n", - "_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", - " model='silero_lang_detector',\n", - " force_reload=True)\n", - "\n", - "(get_language,\n", - " read_audio) = utils\n", - "\n", - "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n", - "\n", - "def init_onnx_model(model_path: str):\n", - " return onnxruntime.InferenceSession(model_path)\n", - "\n", - "def validate_onnx(model, inputs):\n", - " with torch.no_grad():\n", - " ort_inputs = {'input': inputs.cpu().numpy()}\n", - " outs = model.run(None, ort_inputs)\n", - " outs = [torch.Tensor(x) for x in outs]\n", - " return outs" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "hidden": true, - "id": "G8N8oP4q2Fw6" - }, - "source": [ - "### Full Audio" - ] - }, - { - "cell_type": "code", - "metadata": { - "hidden": true, - "id": "WHXnh9IV2Fw6" - }, - "source": [ - "model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n", - "wav = read_audio(f'{files_dir}/en.wav')\n", - "\n", - "lang = get_language(wav, model, 
run_function=validate_onnx)\n", - "print(lang)" - ], - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "sVNOuHQQjsrp" + }, + "source": [ + "# PyTorch Examples" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FpMplOCA2Fwp" + }, + "source": [ + "## VAD" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "id": "62A6F_072Fwq" + }, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "5w5AkskZ2Fwr" + }, + "outputs": [], + "source": [ + "#@title Install and Import Dependencies\n", + "\n", + "# this assumes that you have a relevant version of PyTorch installed\n", + "!pip install -q torchaudio\n", + "\n", + "SAMPLE_RATE = 16000\n", + "\n", + "import glob\n", + "import torch\n", + "torch.set_num_threads(1)\n", + "\n", + "from IPython.display import Audio\n", + "from pprint import pprint\n", + "\n", + "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_vad',\n", + " force_reload=True)\n", + "\n", + "(get_speech_timestamps,\n", + " save_audio,\n", + " read_audio,\n", + " VADIterator,\n", + " collect_chunks) = utils\n", + "\n", + "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fXbbaUO3jsrw" + }, + "source": [ + "### Full Audio" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RAfJPb_a-Auj" + }, + "source": [ + "**Speech timestamps from full audio**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aI_eydBPjsrx" + }, + "outputs": [], + "source": [ + "wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", + "# get speech timestamps from full audio file\n", + "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLE_RATE)\n", + "pprint(speech_timestamps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OuEobLchjsry" + }, + "outputs": [], + "source": [ + "# merge all speech chunks to one audio\n", + "save_audio('only_speech.wav',\n", + " collect_chunks(speech_timestamps, wav), sampling_rate=16000) \n", + "Audio('only_speech.wav')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iDKQbVr8jsry" + }, + "source": [ + "### Stream imitation example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "q-lql_2Wjsry" + }, + "outputs": [], + "source": [ + "## using VADIterator class\n", + "\n", + "vad_iterator = VADIterator(model)\n", + "wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", + "\n", + "window_size_samples = 1536 # number of samples in a single audio chunk\n", + "for i in range(0, len(wav), window_size_samples):\n", + " speech_dict = vad_iterator(wav[i: i+ window_size_samples], return_seconds=True)\n", + " if speech_dict:\n", + " print(speech_dict, end=' ')\n", + "vad_iterator.reset_states() # reset model states after each audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BX3UgwwB2Fwv" + }, + "outputs": [], + "source": [ + "## just probabilities\n", + "\n", + "wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)\n", + "speech_probs = []\n", + "window_size_samples = 1536\n", + "for i in range(0, len(wav), window_size_samples):\n", + " speech_prob = model(wav[i: i+
window_size_samples], SAMPLE_RATE).item()\n", + " speech_probs.append(speech_prob)\n", + "\n", + "pprint(speech_probs[:100])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "id": "36jY0niD2Fww" + }, + "source": [ + "## Number detector" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "scd1DlS42Fwx" + }, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "Kq5gQuYq2Fwx" + }, + "outputs": [], + "source": [ + "#@title Install and Import Dependencies\n", + "\n", + "# this assumes that you have a relevant version of PyTorch installed\n", + "!pip install -q torchaudio soundfile\n", + "\n", + "import glob\n", + "import torch\n", + "torch.set_num_threads(1)\n", + "\n", + "from IPython.display import Audio\n", + "from pprint import pprint\n", + "\n", + "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_number_detector',\n", + " force_reload=True)\n", + "\n", + "(get_number_ts,\n", + " save_audio,\n", + " read_audio,\n", + " collect_chunks,\n", + " drop_chunks) = utils\n", + "\n", + "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "qhPa30ij2Fwy" + }, + "source": [ + "### Full audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "EXpau6xq2Fwy" + }, + "outputs": [], + "source": [ + "wav = read_audio(f'{files_dir}/en_num.wav')\n", + "# get number timestamps from full audio file\n", + "number_timestamps = get_number_ts(wav, model)\n", + "pprint(number_timestamps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "u-KfXRhZ2Fwy" + }, + "outputs": [], + "source": [ + "sample_rate = 16000\n", + "# convert ms in timestamps to samples\n", + "for timestamp in number_timestamps:\n", + " timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n", + " timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "iwYEC4aZ2Fwy" + }, + "outputs": [], + "source": [ + "# merge all number chunks to one audio\n", + "save_audio('only_numbers.wav',\n", + " collect_chunks(number_timestamps, wav), sample_rate) \n", + "Audio('only_numbers.wav')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "fHaYejX12Fwy" + }, + "outputs": [], + "source": [ + "# drop all number chunks from audio\n", + "save_audio('no_numbers.wav',\n", + " drop_chunks(number_timestamps, wav), sample_rate) \n", + "Audio('no_numbers.wav')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "id": "PnKtJKbq2Fwz" + }, + "source": [ + "## Language detector" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "F5cAmMbP2Fwz" + }, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "Zu9D0t6n2Fwz" + }, + "outputs": [], + "source": [ + "#@title Install and Import Dependencies\n", + "\n", + "# this assumes that you have a relevant version of PyTorch installed\n", + "!pip install -q torchaudio soundfile\n", + "\n", + "import glob\n", + 
"import torch\n", + "torch.set_num_threads(1)\n", + "\n", + "from IPython.display import Audio\n", + "from pprint import pprint\n", + "\n", + "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_lang_detector',\n", + " force_reload=True)\n", + "\n", + "(get_language,\n", + " read_audio) = utils\n", + "\n", + "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "iC696eMX2Fwz" + }, + "source": [ + "### Full audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "c8UYnYBF2Fw0" + }, + "outputs": [], + "source": [ + "wav = read_audio(f'{files_dir}/en.wav')\n", + "lang = get_language(wav, model)\n", + "print(lang)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "57avIBd6jsrz" + }, + "source": [ + "# ONNX Example" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hEhnfORV2Fw0" + }, + "source": [ + "## VAD" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Cy7y-NAyALSe" + }, + "source": [ + "**TO BE DONE**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "id": "7QMvUvpg2Fw4" + }, + "source": [ + "## Number detector" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "tBPDkpHr2Fw4" + }, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "hidden": true, + "id": "PdjGd56R2Fw5" + }, + "outputs": [], + "source": [ + "#@title Install and Import Dependencies\n", + "\n", + "# this assumes that you have a relevant version of PyTorch installed\n", + "!pip install -q torchaudio soundfile onnxruntime\n", + "\n", + "import glob\n", + "import torch\n", + "import onnxruntime\n", + "from pprint import pprint\n", + "\n", + "from IPython.display import Audio\n", + "\n", + "_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_number_detector',\n", + " force_reload=True)\n", + "\n", + "(get_number_ts,\n", + " save_audio,\n", + " read_audio,\n", + " collect_chunks,\n", + " drop_chunks) = utils\n", + "\n", + "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n", + "\n", + "def init_onnx_model(model_path: str):\n", + " return onnxruntime.InferenceSession(model_path)\n", + "\n", + "def validate_onnx(model, inputs):\n", + " with torch.no_grad():\n", + " ort_inputs = {'input': inputs.cpu().numpy()}\n", + " outs = model.run(None, ort_inputs)\n", + " outs = [torch.Tensor(x) for x in outs]\n", + " return outs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "I9QWSFZh2Fw5" + }, + "source": [ + "### Full Audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "_r6QZiwu2Fw5" + }, + "outputs": [], + "source": [ + "model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n", + "wav = read_audio(f'{files_dir}/en_num.wav')\n", + "\n", + "# get number timestamps from full audio file\n", + "number_timestamps = get_number_ts(wav, model, run_function=validate_onnx)\n", + "pprint(number_timestamps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "FN4aDwLV2Fw5" + }, + "outputs": [], + "source": [ + "sample_rate = 16000\n", + "# convert ms in timestamps to 
samples\n", + "for timestamp in number_timestamps:\n", + " timestamp['start'] = int(timestamp['start'] * sample_rate / 1000)\n", + " timestamp['end'] = int(timestamp['end'] * sample_rate / 1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "JnvS6WTK2Fw5" + }, + "outputs": [], + "source": [ + "# merge all number chunks to one audio\n", + "save_audio('only_numbers.wav',\n", + " collect_chunks(number_timestamps, wav), 16000) \n", + "Audio('only_numbers.wav')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "yUxOcOFG2Fw6" + }, + "outputs": [], + "source": [ + "# drop all number chunks from audio\n", + "save_audio('no_numbers.wav',\n", + " drop_chunks(number_timestamps, wav), 16000) \n", + "Audio('no_numbers.wav')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "id": "SR8Bgcd52Fw6" + }, + "source": [ + "## Language detector" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "heading_collapsed": true, + "hidden": true, + "id": "PBnXPtKo2Fw6" + }, + "source": [ + "### Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "hidden": true, + "id": "iNkDWJ3H2Fw6" + }, + "outputs": [], + "source": [ + "#@title Install and Import Dependencies\n", + "\n", + "# this assumes that you have a relevant version of PyTorch installed\n", + "!pip install -q torchaudio soundfile onnxruntime\n", + "\n", + "import glob\n", + "import torch\n", + "import onnxruntime\n", + "from pprint import pprint\n", + "\n", + "from IPython.display import Audio\n", + "\n", + "_, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n", + " model='silero_lang_detector',\n", + " force_reload=True)\n", + "\n", + "(get_language,\n", + " read_audio) = utils\n", + "\n", + "files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'\n", + "\n", + "def init_onnx_model(model_path: str):\n", + " return onnxruntime.InferenceSession(model_path)\n", + "\n", + "def validate_onnx(model, inputs):\n", + " with torch.no_grad():\n", + " ort_inputs = {'input': inputs.cpu().numpy()}\n", + " outs = model.run(None, ort_inputs)\n", + " outs = [torch.Tensor(x) for x in outs]\n", + " return outs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hidden": true, + "id": "G8N8oP4q2Fw6" + }, + "source": [ + "### Full Audio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "hidden": true, + "id": "WHXnh9IV2Fw6" + }, + "outputs": [], + "source": [ + "model = init_onnx_model(f'{files_dir}/number_detector.onnx')\n", + "wav = read_audio(f'{files_dir}/en.wav')\n", + "\n", + "lang = get_language(wav, model, run_function=validate_onnx)\n", + "print(lang)" + ] + } + ], + "metadata": { + "colab": { + "name": "silero-vad.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": 
false + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/utils_vad.py b/utils_vad.py index aed2a0e..1a03085 100644 --- a/utils_vad.py +++ b/utils_vad.py @@ -20,7 +20,6 @@ def validate(model, def read_audio(path: str, sampling_rate: int = 16000): - assert torchaudio.get_audio_backend() == 'soundfile' wav, sr = torchaudio.load(path) if wav.size(0) > 1: @@ -63,7 +62,7 @@ def make_visualization(probs, step): def get_speech_timestamps(audio: torch.Tensor, model, threshold: float = 0.5, - sample_rate: int = 16000, + sampling_rate: int = 16000, min_speech_duration_ms: int = 250, min_silence_duration_ms: int = 100, window_size_samples: int = 1536, @@ -85,7 +84,7 @@ def get_speech_timestamps(audio: torch.Tensor, Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. - sample_rate: int (default - 16000) + sampling_rate: int (default - 16000) Currently silero VAD models support 8000 and 16000 sample rates min_speech_duration_ms: int (default - 250 milliseconds) @@ -126,15 +125,15 @@ def get_speech_timestamps(audio: torch.Tensor, if len(audio.shape) > 1: raise ValueError("More than one dimension in audio. Are you trying to process audio with 2 channels?") - if sample_rate == 8000 and window_size_samples > 768: - warnings.warn('window_size_samples is too big for 8000 sample_rate! Better set window_size_samples to 256, 512 or 1536 for 8000 sample rate!') + if sampling_rate == 8000 and window_size_samples > 768: + warnings.warn('window_size_samples is too big for 8000 sampling_rate! Better set window_size_samples to 256, 512 or 768 for 8000 sample rate!') if window_size_samples not in [256, 512, 768, 1024, 1536]: - warnings.warn('Unusual window_size_samples! Supported window_size_samples:\n - [512, 1024, 1536] for 16000 sample_rate\n - [256, 512, 768] for 8000 sample_rate') + warnings.warn('Unusual window_size_samples!
Supported window_size_samples:\n - [512, 1024, 1536] for 16000 sampling_rate\n - [256, 512, 768] for 8000 sampling_rate') model.reset_states() - min_speech_samples = sample_rate * min_speech_duration_ms / 1000 - min_silence_samples = sample_rate * min_silence_duration_ms / 1000 - speech_pad_samples = sample_rate * speech_pad_ms / 1000 + min_speech_samples = sampling_rate * min_speech_duration_ms / 1000 + min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 + speech_pad_samples = sampling_rate * speech_pad_ms / 1000 audio_length_samples = len(audio) @@ -143,7 +142,7 @@ def get_speech_timestamps(audio: torch.Tensor, chunk = audio[current_start_sample: current_start_sample + window_size_samples] if len(chunk) < window_size_samples: chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk)))) - speech_prob = model(chunk, sample_rate).item() + speech_prob = model(chunk, sampling_rate).item() speech_probs.append(speech_prob) triggered = False @@ -194,11 +193,11 @@ def get_speech_timestamps(audio: torch.Tensor, if return_seconds: for speech_dict in speeches: - speech_dict['start'] = round(speech_dict['start'] / sample_rate, 1) - speech_dict['end'] = round(speech_dict['end'] / sample_rate, 1) + speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1) + speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1) if visualize_probs: - make_visualization(speech_probs, window_size_samples / sample_rate) + make_visualization(speech_probs, window_size_samples / sampling_rate) return speeches @@ -276,7 +275,7 @@ class VADIterator: def __init__(self, model, threshold: float = 0.5, - sample_rate: int = 16000, + sampling_rate: int = 16000, min_silence_duration_ms: int = 100, speech_pad_ms: int = 30 ): @@ -292,7 +291,7 @@ def __init__(self, Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. 
- sample_rate: int (default - 16000) + sampling_rate: int (default - 16000) Currently silero VAD models support 8000 and 16000 sample rates min_silence_duration_ms: int (default - 100 milliseconds) @@ -304,9 +303,9 @@ def __init__(self, self.model = model self.threshold = threshold - self.sample_rate = sample_rate - self.min_silence_samples = sample_rate * min_silence_duration_ms / 1000 - self.speech_pad_samples = sample_rate * speech_pad_ms / 1000 + self.sampling_rate = sampling_rate + self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000 + self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000 self.reset_states() def reset_states(self): @@ -327,7 +326,7 @@ def __call__(self, x, return_seconds=False): window_size_samples = len(x[0]) if x.dim() == 2 else len(x) self.current_sample += window_size_samples - speech_prob = self.model(x, self.sample_rate).item() + speech_prob = self.model(x, self.sampling_rate).item() if (speech_prob >= self.threshold) and self.temp_end: self.temp_end = 0 @@ -335,7 +334,7 @@ def __call__(self, x, return_seconds=False): if (speech_prob >= self.threshold) and not self.triggered: self.triggered = True speech_start = self.current_sample - self.speech_pad_samples - return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sample_rate, 1)} + return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)} if (speech_prob < self.threshold - 0.15) and self.triggered: if not self.temp_end: @@ -346,7 +345,7 @@ def __call__(self, x, return_seconds=False): speech_end = self.temp_end + self.speech_pad_samples self.temp_end = 0 self.triggered = False - return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sample_rate, 1)} + return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)} return None
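For reviewers: a minimal sketch of the renamed keyword after this patch, assembled from the notebook cells above. It assumes only the snakers4/silero-vad hub entry point and the bundled en.wav sample used throughout the notebook; nothing here goes beyond the sample_rate -> sampling_rate rename that this diff makes.

import torch

# load the model and helper functions exactly as the notebook does
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad')
(get_speech_timestamps, save_audio, read_audio,
 VADIterator, collect_chunks) = utils

SAMPLE_RATE = 16000
files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'
wav = read_audio(f'{files_dir}/en.wav', sampling_rate=SAMPLE_RATE)

# offline API: the keyword is now `sampling_rate` (formerly `sample_rate`)
speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLE_RATE)

# streaming API: VADIterator's constructor takes the same renamed keyword
vad_iterator = VADIterator(model, sampling_rate=SAMPLE_RATE)

Note that the rename is breaking for keyword callers: downstream code that still passes sample_rate= will raise a TypeError and needs the same rename.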