-
Notifications
You must be signed in to change notification settings - Fork 44
/
stream.py
135 lines (106 loc) · 3.67 KB
/
stream.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import torch
import os
import time
import random
import argparse
import numpy as np
import torch.nn.functional as F
import torchaudio
import json
import sounddevice as sd
import soundfile as sf
from parts.text.cleaners import english_cleaners
from datetime import datetime
from absl import app, flags
import av
import torch
import torchaudio
from absl import app, flags
from rnnt.args import FLAGS
from rnnt.stream import PytorchStreamDecoder, OpenVINOStreamDecoder
import tempfile
import queue
import sys
# Set PyAV's log level to ERROR.
av.logging.set_level(av.logging.ERROR)

# Command-line flags defined by this script.
# PytorchStreamDecoder
flags.DEFINE_string(
    name='model_name',
    default="last.pt",
    help='steps of checkpoint',
)
flags.DEFINE_integer(
    name='step_n_frame',
    default=2,
    help='input frame(stacked)',
)
flags.DEFINE_enum(
    name='stream_decoder',
    default='torch',
    enum_values=['torch', 'openvino'],
    help='stream decoder implementation',
)
flags.DEFINE_string(
    name='url',
    default='https://www.youtube.com/watch?v=2EppLNonncc',
    help='youtube live link',
)
flags.DEFINE_integer(
    name='reset_step',
    default=500,
    help='reset hidden state',
)
flags.DEFINE_string(
    name='path',
    default=None,
    help='path to .wav',
)
'''
server: AudioPreprocessing(
normalize='none', sample_rate=16000, window_size=0.02,
window_stride=0.015, features=args.audio_feat, n_fft=512, log=True,
feat_type='logfbank', trim_silence=True, window='hann',dither=0.00001, frame_splicing=1, transpose_out=False
),
rust: AudioPreprocessing(
normalize='none', sample_rate=16000, window_size=0.02,
window_stride=0.01, features=args.audio_feat, n_fft=512, log=True,
feat_type='logfbank', trim_silence=True, window='hann',dither=0.00001, frame_splicing=1, transpose_out=False
),
'''
# Number of consecutive empty decode results; updated by callback().
blank_counter = 0

# Most recent audio blocks received by callback() (it keeps the last two).
buffer = []

# Default sample rate used by sounddevice.
sd.default.samplerate = 16000
'''
SHALL I NEVER MISS HOME TALK AND BLESSING AND THE COMMON KISS THAT
COMES TO EACH IN TURN NOR COUNT IT STRANGE WHEN I LOOK UP TO DROP ON
A NEW RANGE OF WALLS AND FLOORS ANOTHER HOME THAN THIS
'''
def callback(raw_indata, outdata, frames, time, status):
    """Decode one block of captured audio and print any recognized text.

    Passed as the ``callback`` of the ``sd.Stream`` opened in ``main``. Each
    incoming block is decoded together with the previous block (the two most
    recent blocks are concatenated before decoding).

    Args:
        raw_indata: Input samples for this block; copied before use.
        outdata: Output buffer of the duplex stream; filled with zeros
            because this script never plays audio back.
        frames: Number of frames in the block (unused).
        time: Stream timing information (unused).
        status: Truthy when the stream reports a problem for this block.
    """
    global buffer
    global blank_counter

    # The stream is duplex but nothing is played. The output buffer is
    # handed over uninitialized, so it must be zeroed explicitly or
    # arbitrary data may be sent to the output device.
    outdata.fill(0)

    if status:  # usually something bad
        # The affected block is skipped entirely; only a marker is printed.
        print("X", flush=True, end=" ")
        return

    # Keep only the two newest blocks and decode them as one segment.
    buffer.append(raw_indata.copy())
    buffer = buffer[-2:]
    samples = np.concatenate(buffer, axis=0)

    # NOTE(review): the stream in main() is opened with dtype='float32', so
    # samples are presumably already in [-1, 1]; confirm this extra 1/65536
    # scaling is what the model expects.
    samples = samples / (1 << 16)

    # Flatten to 1-D and add a leading dimension of size 1 for the decoder.
    waveform = torch.from_numpy(samples.flatten()).float().unsqueeze(0)
    seq = stream_decoder.decode(waveform)

    if seq == "":
        blank_counter += 1
        # At exactly 35 consecutive empty results, report background noise
        # and reset the decoder. The counter keeps growing afterwards, so
        # this fires once per silent stretch.
        if blank_counter == 35:
            print(' [Background]')
            stream_decoder.reset()
    else:
        blank_counter = 0
        print(seq, end='', flush=True)
def chunks(lst, n):
    """Yield successive n-sized chunks from lst.

    The final chunk is shorter when ``len(lst)`` is not a multiple of ``n``.
    An empty ``lst`` yields nothing.

    Args:
        lst: A sliceable sequence supporting ``len()``.
        n: Chunk size; must be positive.

    Yields:
        Consecutive slices ``lst[i:i + n]``.

    Raises:
        ValueError: If ``n`` is not positive. As with any generator, this is
            raised when iteration starts, not when ``chunks`` is called.
    """
    # A zero step makes range() fail with an unrelated-looking message, and
    # a negative step would silently produce no chunks at all.
    if n <= 0:
        raise ValueError("chunk size n must be positive, got %r" % (n,))
    for start in range(0, len(lst), n):
        yield lst[start:start + n]
def test_wav(wav_file):
    """Decode a single audio file and print the recognized text.

    The file is loaded with torchaudio, resampled to 16 kHz when its sample
    rate differs, and only the first channel is passed to the decoder.

    Args:
        wav_file: Path of the audio file to transcribe.
    """
    # The former ``normalization=True`` keyword is intentionally omitted:
    # newer torchaudio releases name it ``normalize`` and reject the old
    # spelling, while the default in both APIs already returns normalized
    # float samples.
    data, sr = torchaudio.load(wav_file)
    if sr != 16000:
        resample = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        data = resample(data)

    # Select the first channel, then restore a leading dimension of size 1.
    first_channel = data[0].unsqueeze(0)
    seq = stream_decoder.decode(first_channel)
    print(seq)
def main(argv):
    """Create the stream decoder, then transcribe a file or live audio.

    When ``--path`` is given the file is decoded once with ``test_wav``;
    otherwise an audio stream is opened for a fixed 80 seconds and every
    block is handed to ``callback``.

    Args:
        argv: Command-line arguments forwarded by ``app.run`` (unused).
    """
    global stream_decoder

    # Honor --stream_decoder instead of always using the PyTorch decoder.
    # The default ('torch') keeps the previous behavior.
    # NOTE(review): assumes OpenVINOStreamDecoder is constructed from FLAGS
    # just like PytorchStreamDecoder - confirm against rnnt.stream.
    if FLAGS.stream_decoder == 'openvino':
        stream_decoder = OpenVINOStreamDecoder(FLAGS)
    else:
        stream_decoder = PytorchStreamDecoder(FLAGS)

    duration = 80  # seconds of live audio to process

    if FLAGS.path is not None:
        test_wav(FLAGS.path)
        return

    # Frames delivered to callback() per block.
    blocksize = (FLAGS.win_length * FLAGS.step_n_frame
                 + (FLAGS.step_n_frame - 1))
    with sd.Stream(channels=1, dtype='float32', samplerate=16000,
                   blocksize=blocksize, callback=callback,
                   latency='high'):
        # Audio is processed in callback(); this thread just waits.
        sd.sleep(duration * 1000)  # sd.sleep takes milliseconds


if __name__ == "__main__":
    app.run(main)