Skip to content

Other Models

Dimitrii Voronin edited this page Dec 17, 2021 · 7 revisions

Other models besides VAD

Number Detector

Number Detector detects spoken numbers (e.g. "thirty-five") in 4 languages: English, German, Russian, and Spanish.

In some cases it is crucial to be able to anonymize large-scale spoken corpora (i.e. remove personal data). Typically personal data is considered to be private or sensitive if it contains a name or some private ID. Name recognition is a highly subjective matter and it depends on locale and business case, but VAD and Number Detection are quite general tasks.

How to use Number Detector:

  • It is recommended to split long audio into short chunks (< 15 s) and apply the model to each of them.
  • Number Detector can classify if the whole audio contains a number, or if each audio frame contains a number.
  • Audio is split into frames in a certain way, so, having a per-frame output, we can reconstruct the time boundaries for numbers with an accuracy of about 0.2s.
example
#@title Install and Import Dependencies

# Notebook cell: the `!pip ...` lines are IPython shell commands, so this
# snippet runs in Jupyter/Colab but not as a plain Python script.
# this assumes that you have a relevant version of PyTorch installed
!pip install -q torchaudio

# Sampling rate passed to read_audio / save_audio and used for the
# ms -> samples conversion further down.
SAMPLING_RATE = 16000

import torch
# Restrict PyTorch to a single thread for intra-op CPU parallelism.
torch.set_num_threads(1)

from IPython.display import Audio
from pprint import pprint
# download example
torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en_num.wav', 'en_number_example.wav')

USE_ONNX = True # True installs onnxruntime below and passes onnx=True to the hub loader; set to False to skip ONNX
if USE_ONNX:
    !pip install -q onnxruntime
  
# force_reload=True makes torch.hub fetch the repo again instead of using
# the cached copy.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_number_detector',
                              force_reload=True,
                              onnx=USE_ONNX)
# `utils` is unpacked positionally: the order of these names must match
# the tuple returned by the hub entry point.
(get_number_ts,
 save_audio,
 read_audio,
 collect_chunks,
 drop_chunks) = utils

# Load the example file downloaded earlier at the configured sampling rate.
wav = read_audio('en_number_example.wav', sampling_rate=SAMPLING_RATE)
# get number timestamps from full audio file
# (the result is iterated as dicts with 'start' and 'end' keys below)
number_timestamps = get_number_ts(wav, model)
pprint(number_timestamps)

# convert ms in timestamps to samples
# NOTE: the dicts are modified in place, so `number_timestamps` holds sample
# offsets (not milliseconds) from here on.
for timestamp in number_timestamps:
    timestamp['start'] = int(timestamp['start'] * SAMPLING_RATE / 1000)
    timestamp['end'] = int(timestamp['end'] * SAMPLING_RATE / 1000)

# merge all number chunks to one audio
save_audio('only_numbers.wav',
           collect_chunks(number_timestamps, wav), SAMPLING_RATE) 
# Notebook audio player object for the file written above.
Audio('only_numbers.wav')

# drop all number chunks from audio
save_audio('no_numbers.wav',
           drop_chunks(number_timestamps, wav), SAMPLING_RATE) 
# Notebook audio player object for the file written above.
Audio('no_numbers.wav')

</details>

### Language Classifier

- **99%** validation accuracy.
- **Language classifier** was trained using audio samples in 4 languages: **Russian**, **English**, **Spanish**, **German**.
- Arbitrary audio length can be used, although the network was trained on audio shorter than 15 seconds.
- [95 languages version](#language-classifier-95)

<details>
<summary>JIT example</summary>


```python
# Language classifier example: load the model through torch.hub and
# classify one sample file.
import torch
# Restrict PyTorch to a single thread for intra-op CPU parallelism.
torch.set_num_threads(1)
from pprint import pprint

# force_reload=True makes torch.hub fetch the repo again instead of using
# the cached copy.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_lang_detector',
                              force_reload=True)

# `utils` is unpacked positionally; the third element is not needed here.
get_language, read_audio, _ = utils

# Sample files directory inside the torch.hub cache; the folder name is
# hard-coded for the repo's `master` checkout.
files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'

wav = read_audio(f'{files_dir}/de.wav')
language = get_language(wav, model)

pprint(language)
```

</details>

ONNX example
# Language classifier, ONNX variant: torch.hub is used here for the helper
# functions; the `model` it returns is replaced by an ONNX Runtime session
# later in this example.
import torch
import onnxruntime
from pprint import pprint

# force_reload=True makes torch.hub fetch the repo again instead of using
# the cached copy.
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_lang_detector',
                              force_reload=True)
                              
# `utils` is unpacked positionally; the third helper is the ONNX downloader
# (the local name keeps the original spelling).
get_language, read_audio, donwload_onnx_model = utils
# Presumably fetches `number_detector.onnx` into the working directory, since
# that file is opened further down -- TODO confirm.
# NOTE(review): the language classifier example uses the 'number_detector'
# model file; presumably one model serves both tasks -- confirm against the repo.
donwload_onnx_model('number_detector')

# Sample files directory inside the torch.hub cache; the folder name is
# hard-coded for the repo's `master` checkout.
files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'


def init_onnx_model(model_path: str):
    """Create an ONNX Runtime inference session for the model at *model_path*.

    Args:
        model_path: Path to the ``.onnx`` file to load.

    Returns:
        The ``onnxruntime.InferenceSession`` built from that file.
    """
    session = onnxruntime.InferenceSession(model_path)
    return session

def validate_onnx(model, inputs):
    """Run an ONNX Runtime session on a torch tensor and return torch tensors.

    Args:
        model: Object exposing ``run(output_names, input_feed)``, such as the
            session returned by ``init_onnx_model``.
        inputs: Torch tensor; it is moved to CPU and converted to a NumPy
            array, then fed to the session under the input name ``'input'``.

    Returns:
        A list holding one ``torch.Tensor`` per array returned by
        ``model.run``.
    """
    with torch.no_grad():
        feed = {'input': inputs.cpu().numpy()}
        # None as the first argument asks the session for all of its outputs.
        return [torch.Tensor(array) for array in model.run(None, feed)]
    
# Rebind `model` from the torch.hub model to an ONNX Runtime session.
# The path is relative to the current working directory.
model = init_onnx_model('number_detector.onnx')
wav = read_audio(f'{files_dir}/de.wav')

# run_function hands get_language the ONNX wrapper defined above;
# presumably it is called in place of the default model invocation -- TODO confirm.
language = get_language(wav, model, run_function=validate_onnx)
print(language)

Language Classifier 95

  • 85% validation accuracy among 95 languages, 90% validation accuracy among 58 language groups
  • Language classifier 95 was trained using audio samples in 95 languages
  • Arbitrary audio length can be used, although the network was trained on audio shorter than 20 seconds
JIT example
# Language classifier 95 example: load the model and its label dictionaries
# through torch.hub, then print the top predictions for one sample file.
import torch
# Restrict PyTorch to a single thread for intra-op CPU parallelism.
torch.set_num_threads(1)
from pprint import pprint

# This hub entry point returns four values: the model, two dictionaries
# (languages and language groups) and the helper tuple.
# force_reload=True makes torch.hub fetch the repo again instead of using
# the cached copy.
model, lang_dict, lang_group_dict,  utils = torch.hub.load(
                              repo_or_dir='snakers4/silero-vad',
                              model='silero_lang_detector_95',
                              force_reload=True)

# `utils` is unpacked positionally; the third element is not needed here.
get_language_and_group, read_audio, _ = utils

# Sample files directory inside the torch.hub cache; the folder name is
# hard-coded for the repo's `master` checkout.
files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'

wav = read_audio(f'{files_dir}/de.wav')
# top_n=2 presumably limits each result list to the two best candidates -- TODO confirm.
languages, language_groups = get_language_and_group(wav, model, lang_dict, lang_group_dict, top_n=2)

# Each entry is indexed as a sequence: first element is the label, last is
# the probability.
for i in languages:
  pprint(f'Language: {i[0]} with prob {i[-1]}')

for i in language_groups:
  pprint(f'Language group: {i[0]} with prob {i[-1]}')
ONNX example
# Language classifier 95, ONNX variant: torch.hub is used here for the label
# dictionaries and helper functions; the `model` it returns is replaced by
# an ONNX Runtime session later in this example.
import torch
import onnxruntime
from pprint import pprint

# This hub entry point returns four values: the model, two dictionaries
# (languages and language groups) and the helper tuple.
# force_reload=True makes torch.hub fetch the repo again instead of using
# the cached copy.
model, lang_dict, lang_group_dict,  utils = torch.hub.load(
                              repo_or_dir='snakers4/silero-vad',
                              model='silero_lang_detector_95',
                              force_reload=True)
                              
# `utils` is unpacked positionally; the third helper is the ONNX downloader
# (the local name keeps the original spelling).
get_language_and_group, read_audio, donwload_onnx_model = utils
# Presumably fetches `lang_classifier_95.onnx` into the working directory,
# since that file is opened further down -- TODO confirm.
donwload_onnx_model('lang_classifier_95')

# Sample files directory inside the torch.hub cache; the folder name is
# hard-coded for the repo's `master` checkout.
files_dir = torch.hub.get_dir() + '/snakers4_silero-vad_master/files'

def init_onnx_model(model_path: str):
    """Open the ONNX model stored at *model_path* with ONNX Runtime.

    Args:
        model_path: Location of the ``.onnx`` file.

    Returns:
        An ``onnxruntime.InferenceSession`` for the loaded model.
    """
    onnx_session = onnxruntime.InferenceSession(model_path)
    return onnx_session

def validate_onnx(model, inputs):
    """Feed a torch tensor to an ONNX Runtime session and wrap the outputs.

    Args:
        model: Object exposing ``run(output_names, input_feed)``, such as the
            session returned by ``init_onnx_model``.
        inputs: Torch tensor; converted to a CPU NumPy array and passed to
            the session under the input name ``'input'``.

    Returns:
        A list of ``torch.Tensor`` objects, one per array produced by
        ``model.run``.
    """
    with torch.no_grad():
        numpy_input = inputs.cpu().numpy()
        # None as the first argument asks the session for all of its outputs.
        session_outputs = model.run(None, {'input': numpy_input})
        as_tensors = [torch.Tensor(output) for output in session_outputs]
    return as_tensors
    
# Rebind `model` from the torch.hub model to an ONNX Runtime session.
# The path is relative to the current working directory.
model = init_onnx_model('lang_classifier_95.onnx')
wav = read_audio(f'{files_dir}/de.wav')

# run_function hands the helper the ONNX wrapper defined above; presumably it
# is called in place of the default model invocation -- TODO confirm.
languages, language_groups = get_language_and_group(wav, model, lang_dict, lang_group_dict, top_n=2, run_function=validate_onnx)

# Each entry is indexed as a sequence: first element is the label, last is
# the probability.
for i in languages:
  pprint(f'Language: {i[0]} with prob {i[-1]}')

for i in language_groups:
  pprint(f'Language group: {i[0]} with prob {i[-1]}')
Clone this wiki locally