forked from aalto-speech/speaker-diarization
-
Notifications
You must be signed in to change notification settings - Fork 0
/
voice-detection2.py
executable file
·198 lines (179 loc) · 8.12 KB
/
voice-detection2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/usr/bin/env python2
import argparse
import sys
import os.path as op
import re
def parse_recipe(rfile):
    """Extract the audio file referenced by each line of a recipe.

    Every line is searched for an ``audio=<path>`` field, where the path
    is a run of non-whitespace characters.  Lines without such a field
    are reported on standard output and skipped.

    Args:
        rfile: Iterable of recipe lines, e.g. an open text file.

    Returns:
        List with the audio paths, in the order they were found.
    """
    # Raw string: '\S' inside a plain literal is an invalid escape
    # sequence, which newer interpreters warn about.
    audio_field = re.compile(r'audio=(\S+)')
    audio_files = []
    for line in rfile:
        match = audio_field.search(line)
        if match is None:
            # Single-argument print() emits the same text whether it is
            # parsed as a statement or as a function call.
            print('Recipe line without recognizable audio files:')
            print(line)
            continue
        audio_files.append(match.group(1))
    return audio_files
def wav_to_exp(wavfile, epath):
    """Return the path of the .exp file associated with a .wav file.

    The .exp file is looked up in ``epath`` under the audio file's base
    name with its extension replaced by ``.exp``.

    Args:
        wavfile: Path of the audio file; only its base name is used.
        epath: Directory holding the .exp files.

    Returns:
        The path of the existing .exp file.

    Raises:
        SystemExit: With status 1 when the .exp file does not exist.
    """
    stem = op.splitext(op.basename(wavfile))[0]
    expfile = op.join(epath, stem + '.exp')
    if not op.isfile(expfile):
        print('Error, %s does not exist' % expfile)
        # sys.exit() instead of the interactive exit() helper, and a
        # non-zero status so that callers can tell the run failed.
        sys.exit(1)
    return expfile
def inc_lna(lna):
    """Return the lna name that follows ``lna``.

    The rightmost character that is not 'z' is replaced by its successor
    and every character after it is reset to 'a'.  When there is no such
    character (the name is empty or consists only of 'z'), the result is
    a run of 'a' one character longer than the input.
    """
    width = len(lna)
    head = lna.rstrip('z')  # everything up to the rightmost non-'z'
    if not head:
        return 'a' * (width + 1)
    bumped = chr(ord(head[-1]) + 1)
    return head[:-1] + bumped + 'a' * (width - len(head))
def parse_exp_file(expfile, lna):
    """Return the speech turns found in an .exp file.

    The file is scanned for ``<frame> <token>`` pairs, where the token is
    ``p`` (speech) or ``<w>`` (silence).  Frame numbers are divided by the
    configured rate to obtain times, and a two-state machine (in speech /
    in silence) groups the tokens into turns.

    Besides its parameters, this function reads the module-level names
    ``args`` (for ``args.rate``), ``ms``, ``mns``, ``sbe`` and ``see``,
    which are assigned in the ``__main__`` section of the script.

    Args:
        expfile: Path of the .exp file to read.  When ``start`` is still
            non-zero after the scan, the file with the same base name and
            a ``.last_frame`` extension is read as well.
        lna: Base name for the turn labels; turns are named ``<lna>_1``,
            ``<lna>_2``, ...

    Returns:
        List of ``(name, start, end)`` tuples.
    """
    get_frame_re = re.compile(r'(\d+) (p|<w>)')

    def _get_frame(exps):
        """Yield (frame, token) string pairs for every match in ``exps``."""
        for match in get_frame_re.finditer(exps):
            yield match.group(1), match.group(2)
    rate = float(args.rate)  # To ensure floating point division with it
    t = []  # Collected (name, start, end) turns
    lnacount = 1  # Numeric suffix of the next turn name
    # NOTE(review): 0.0 doubles as "nothing recorded" for start and end
    # (both are tested with plain truthiness below), so a boundary recorded
    # at exactly time 0.0 is treated as if it had not been recorded.
    start = 0.0
    end = 0.0
    total_time = 0.0  # Running time, advanced by every token
    previous_frame = 0.0
    frame_diff = 0.0
    in_speech = False  # We assume we don't start in speech
    with open(expfile, 'r') as efile:
        for line in efile:  # Should be 1 in the new format
            for frame, token in _get_frame(line):
                # Time elapsed since the previous token
                frame_diff = float(frame) - previous_frame
                previous_frame = float(frame)
                advanced_time = frame_diff / rate
                total_time += advanced_time
                if in_speech:
                    if token == '<w>':  # Silence
                        # Possible ending, record it
                        end = total_time
                    elif token == 'p':  # Puhe, speech
                        if end:  # There was a possible end, it counts?
                            if advanced_time < mns:  # Ignore this end
                                end = 0.0
                            else:  # It counts
                                # Speech turn over, write
                                # NOTE(review): start - sbe is not clamped,
                                # so it can go below zero when sbe > start.
                                in_speech = False
                                t.append((lna + '_' + str(lnacount), start - sbe,
                                          end + see))
                                lnacount += 1
                                # Possible start of a new turn
                                start = total_time
                        else:
                            # Speech token while in speech and without a
                            # pending end: nothing to update.
                            pass
                else:  # In silence
                    if token == '<w>':  # Silence
                        if start:  # There was a possible start, it counts?
                            if advanced_time < ms:  # Ignore this start
                                start = 0.0
                            else:  # It counts
                                # Silence over, this silence marks a possible
                                # end
                                end = total_time
                                in_speech = True
                        else:
                            # Silence token while in silence and without a
                            # pending start: nothing to update.
                            pass
                    elif token == 'p':  # Puhe, speech
                        # Possible start, record it
                        start = total_time
    if start:
        # One turn started but it hasn't ended, write it as last turn if longer
        # than ms.  Its end is the value stored in the .last_frame file
        # divided by the rate; see is not added here.
        last_frame_file = op.splitext(expfile)[0] + '.last_frame'
        with open(last_frame_file, 'r') as f:
            last_frame_time = float(f.read()) / rate
        if last_frame_time - start >= ms:
            t.append((lna + '_' + str(lnacount), start - sbe, last_frame_time))
    return t
def write_recipe_line(wav, lna, start, end, outf):
    """Write a single recipe line to ``outf``.

    The line has the form
    ``audio=<wav> lna=<lna> start-time=<start> end-time=<end>`` followed
    by a newline.  ``wav`` and ``lna`` must be strings; ``start`` and
    ``end`` are converted with ``str()``.  ``outf`` is any object with a
    ``write`` method.
    """
    pieces = [
        'audio=', wav,
        ' lna=', lna,
        ' start-time=', str(start),
        ' end-time=', str(end),
        '\n',
    ]
    outf.write(''.join(pieces))
def write_recipe(rec, epath, outf):
    """Write the recipe lines for every audio file in ``rec``.

    For each audio file the matching .exp file is located in ``epath``
    and parsed into turns, and one recipe line per turn is written to
    ``outf``.  Each audio file gets its own lna base name, starting at
    'a' and advanced with inc_lna() after every file.
    """
    base = 'a'  # lna base name used for the current audio file
    for wav in rec:
        expfile = wav_to_exp(wav, epath)
        for turn in parse_exp_file(expfile, base):
            name, begin, finish = turn[0], turn[1], turn[2]
            write_recipe_line(wav, name, begin, finish, outf)
        base = inc_lna(base)  # next audio file gets the next base name
if __name__ == '__main__':
    # Command-line entry point: read the input recipe, locate the .exp
    # file of every audio file and write the resulting turn recipe.
    #
    # NOTE: this section deliberately stays at module level.
    # parse_exp_file() reads `args`, `ms`, `mns`, `sbe` and `see` as module
    # globals, so they have to be assigned here and not inside a function.
    parser = argparse.ArgumentParser(description='Creates a recipe from the \
        Speech Activity Detection generate_exp.py output, \
        (.exp files) that is, speech/non-speech turn detection')
    parser.add_argument('recfile', type=str,
                        help='Specifies the input recipe file')
    parser.add_argument('exppath', type=str,
                        help='Specifies the input .exp files path')
    parser.add_argument('-o', dest='outfile', type=str, default='stdout',
                        help='Specifies an output file, default stdout.')
    parser.add_argument('-r', dest='rate', type=int, default=125,
                        help='Specifies the sample rate, default 125.')
    parser.add_argument('-ms', dest='minspeech', type=float, default=0.2,
                        help='Specifies the minimum speech turn duration, \
                        default 0.2 seconds (roughly one word).')
    # This 0.3 second default comes from "The 2009 (RT-09) Rich
    # Transcription Meeting Recognition Evaluation Plan", page 4
    parser.add_argument('-mns', dest='minnonspeech', type=float, default=0.3,
                        help='Specifies the minimum nonspeech between-turns\
                        duration, default 0.3 seconds (NIST standard).')
    parser.add_argument('-sbe', dest='seg_before_exp', type=float, default=0.0,
                        help='Specifies a segment expansion time, that is, to\
                        remove some time before each detected speaker segment.\
                        No overlapping check, so set this to less than half \
                        the value of -mns. Default is 0.0 seconds (no expansion).')
    parser.add_argument('-see', dest='seg_end_exp', type=float, default=0.0,
                        help='Specifies a segment end expansion time, that is,\
                        to add some time after each detected speaker segment. \
                        No overlapping check, so set this to less than half \
                        the value of -mns. Default is 0.0 seconds (no expansion).')
    args = parser.parse_args()
    # Frame numbers are divided by the rate, so zero would raise
    # ZeroDivisionError and a negative value would yield negative times.
    if args.rate <= 0:
        parser.error('the rate given with -r must be a positive integer')

    # Process arguments.  Single-argument print() emits the same text
    # whether it is parsed as a statement or as a function call.
    print('Reading recipe from: %s' % args.recfile)
    with open(args.recfile, 'r') as recfile:
        recipe = parse_recipe(recfile)
    print('Reading .exp files from: %s' % args.exppath)
    exppath = args.exppath
    if not op.isdir(exppath):
        print('Error, %s is not a valid directory' % exppath)
        # sys.exit() instead of the interactive exit() helper, and a
        # non-zero status so that callers can tell the run failed.
        sys.exit(1)
    if args.outfile != 'stdout':
        outfile = args.outfile
        print('Writing output to: %s' % args.outfile)
    else:
        outfile = sys.stdout
        print('Writing output to: stdout')
    print('Sample rate set to: %s' % args.rate)
    rate = float(args.rate)  # To ensure floating point division with it
    ms = args.minspeech
    print('Minimum speech turn duration: %s seconds' % ms)
    mns = args.minnonspeech
    print('Minimum nonspeech between-turns duration: %s seconds' % mns)
    sbe = args.seg_before_exp
    print('Segment before expansion set to: %s seconds' % sbe)
    see = args.seg_end_exp
    print('Segment end expansion set to: %s seconds' % see)

    # Do the real work: outfile is either a path to open or sys.stdout
    if outfile is not sys.stdout:
        with open(outfile, 'w') as outf:
            write_recipe(recipe, exppath, outf)
    else:
        write_recipe(recipe, exppath, outfile)