forked from aalto-speech/speaker-diarization
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvoice-detection.py
executable file
·192 lines (175 loc) · 7.14 KB
/
voice-detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#!/usr/bin/python2
import argparse
import sys
import os
import re
def parse_recipe(rfile):
    """Collect the audio file named on each line of a recipe.

    rfile -- iterable of recipe lines (for instance an open file).  The
             first ``audio=<non-whitespace characters>`` field found on
             each line is taken as the audio file name.

    Returns the list of audio file names in input order.  Lines without
    an ``audio=`` field are reported on standard output and skipped.
    """
    # Raw string so that \S reaches the regex engine untouched
    audio_field = re.compile(r'audio=(\S+)')
    audio_files = []
    for line in rfile:
        match = audio_field.search(line)
        if match is None:
            # Single-argument print(...) behaves the same on Python 2 and 3
            print('Recipe line without recognizable audio files:')
            print(line)
            continue
        audio_files.append(match.group(1))
    return audio_files
def wav_to_exp(wavfile, epath):
    """Return the path of the .exp file associated with a .wav file.

    wavfile -- audio file name; only its base name without the extension
               is used.
    epath   -- directory that holds the .exp files.

    Returns ``<epath>/<base name of wavfile>.exp``.

    Exits the program (SystemExit) with an error message on standard
    error and a non-zero status when that file does not exist.
    """
    stem = os.path.splitext(os.path.basename(wavfile))[0]
    # os.path.join gives the same result as plain concatenation when epath
    # already ends with a separator, and inserts one when it does not
    expfile = os.path.join(epath, stem + '.exp')
    if not os.path.isfile(expfile):
        # sys.exit with a message writes it to stderr and exits with
        # status 1, so calling scripts can detect the failure
        sys.exit('Error, %s does not exist' % expfile)
    return expfile
def inc_lna(lna):
    """Return the lna base name that follows *lna*.

    The last character that is not 'z' is replaced by its successor and
    every character after it is reset to 'a' ('a' -> 'b', 'az' -> 'ba').
    When the name consists only of 'z' characters, or is empty, the
    result is a run of 'a' one character longer ('zz' -> 'aaa').
    """
    # Everything up to and including the last non-'z' character
    head = lna.rstrip('z')
    if not head:
        return 'a' * (len(lna) + 1)
    rolled_over = len(lna) - len(head)  # trailing 'z's that wrap to 'a'
    return head[:-1] + chr(ord(head[-1]) + 1) + 'a' * rolled_over
def parse_exp_file(expfile, lna, frame_rate=None, min_speech=None,
                   min_nonspeech=None, pad_before=None, pad_after=None):
    """Return the speech turns found in a classifier .exp file.

    The file is read line by line and each line is classified by the
    markers it contains:

    * '<w>' while outside a turn, or 'p' while inside one, confirms the
      current state and decays the pending change counter (not below 0).
    * 'p' while outside a turn counts towards opening one; the turn opens
      once the pending count divided by the rate reaches ``min_speech``.
    * '<w>' while inside a turn counts towards closing it; the turn closes
      once the pending count divided by the rate reaches ``min_nonspeech``.
    * Lines matching none of the above are ignored and do not advance the
      line counter.

    A turn starts/ends at the line where the pending run began.  A turn
    still open at the end of the file is kept only when it lasts at least
    ``min_speech``.

    Arguments:
    expfile       -- path of the .exp file to read.
    lna           -- base name; turns are named '<lna>_1', '<lna>_2', ...
    frame_rate    -- number of .exp lines per second; defaults to the
                     module-level ``rate``.
    min_speech    -- minimum speech duration in seconds needed to open a
                     turn; defaults to the module-level ``ms``.
    min_nonspeech -- minimum non-speech duration in seconds needed to
                     close a turn; defaults to the module-level ``mns``.
    pad_before    -- seconds subtracted from each start time; defaults to
                     the module-level ``sbe``.
    pad_after     -- seconds added to each end time; defaults to the
                     module-level ``see``.

    Returns a list of ``(name, start, end)`` tuples with times computed as
    line index divided by ``frame_rate``.
    """
    # Fall back to the settings the script entry point stores at module level
    if frame_rate is None:
        frame_rate = rate
    if min_speech is None:
        min_speech = ms
    if min_nonspeech is None:
        min_nonspeech = mns
    if pad_before is None:
        pad_before = sbe
    if pad_after is None:
        pad_after = see
    frame_rate = float(frame_rate)  # guarantees true division on Python 2

    turns = []
    turn_index = 1
    frame = 0.0           # index of the current counted line
    start = None          # start time of the currently open turn
    in_speech = False
    pending_silence = 0   # non-speech lines counted while inside a turn
    pending_speech = 0    # speech lines counted while outside a turn
    run_start = None      # line at which the pending run began

    with open(expfile, 'r') as efile:
        for line in efile:
            # The order of these tests matters for lines carrying both
            # markers: they always confirm the current state.
            if '<w>' in line and not in_speech:
                if pending_speech > 0:
                    pending_speech -= 1
                else:
                    run_start = None
                frame += 1
            elif 'p' in line and in_speech:
                if pending_silence > 0:
                    pending_silence -= 1
                else:
                    run_start = None
                frame += 1
            elif 'p' in line and not in_speech:
                pending_speech += 1
                # Compare against None: line index 0.0 is a valid run start
                if run_start is None:
                    run_start = frame
                if pending_speech / frame_rate >= min_speech:
                    in_speech = True
                    pending_speech = 0
                    start = run_start / frame_rate - pad_before
                    run_start = None
                frame += 1
            elif '<w>' in line and in_speech:
                pending_silence += 1
                if run_start is None:
                    run_start = frame
                if pending_silence / frame_rate >= min_nonspeech:
                    in_speech = False
                    pending_silence = 0
                    end = run_start / frame_rate + pad_after
                    run_start = None
                    turns.append((lna + '_' + str(turn_index), start, end))
                    start = None
                    turn_index += 1
                frame += 1

    # One turn started but it hasn't ended, write it as last turn if it is
    # at least min_speech long.  in_speech (rather than the truthiness of
    # start) is tested so that a turn starting at 0.0 is not lost.
    if in_speech:
        end = (frame - 1) / frame_rate
        if end - start >= min_speech:
            turns.append((lna + '_' + str(turn_index), start, end + pad_after))
    return turns
def write_recipe_line(wav, lna, start, end, outf):
    """Write a single recipe line describing one turn to *outf*.

    The line has the form
    ``audio=<wav> lna=<lna> start-time=<start> end-time=<end>`` and is
    terminated by a newline; *start* and *end* are rendered with str().
    """
    pieces = [
        'audio=', wav,
        ' lna=', lna,
        ' start-time=', str(start),
        ' end-time=', str(end),
        '\n',
    ]
    outf.write(''.join(pieces))
def write_recipe(rec, epath, outf):
    """Write one recipe line per detected turn of every audio file.

    rec   -- iterable of audio file names.
    epath -- location of the .exp files, handed to wav_to_exp.
    outf  -- writable file object that receives the recipe lines.
    """
    base = 'a'  # lna base name used for the current audio file
    for wav in rec:
        expfile = wav_to_exp(wav, epath)
        for turn in parse_exp_file(expfile, base):
            name, begin, finish = turn[0], turn[1], turn[2]
            write_recipe_line(wav, name, begin, finish, outf)
        # The following audio file gets the next base name
        base = inc_lna(base)
if __name__ == '__main__':
    # Command-line entry point: read the input recipe, locate the .exp
    # files and write one output recipe line per detected speech turn.
    parser = argparse.ArgumentParser(
        description='Creates a recipe from the '
                    'Speech Activity Detection classify_speecon output, '
                    '(.exp files) that is, speech/non-speech turn detection')
    parser.add_argument('recfile', type=str,
                        help='Specifies the input recipe file')
    parser.add_argument('exppath', type=str,
                        help='Specifies the input .exp files path')
    parser.add_argument('-o', dest='outfile', type=str, default=sys.stdout,
                        help='Specifies an output file, default stdout.')
    parser.add_argument('-r', dest='rate', type=int, default=125,
                        help='Specifies the sample rate, default 125.')
    parser.add_argument('-ms', dest='minspeech', type=float, default=0.1,
                        help='Specifies the minimum speech turn duration, '
                             'default 0.1 seconds (roughly one word).')
    # This 0.3 second default comes from "The 2009 (RT-09) Rich
    # Transcription Meeting Recognition Evaluation Plan", page 4
    parser.add_argument('-mns', dest='minnonspeech', type=float, default=0.3,
                        help='Specifies the minimum nonspeech between-turns '
                             'duration, default 0.3 seconds (NIST standard).')
    parser.add_argument('-sbe', dest='seg_before_exp', type=float,
                        default=0.0,
                        help='Specifies a segment expansion time, that is, to '
                             'remove some time before each detected speaker '
                             'segment. No overlapping check, so set this to '
                             'less than half the value of -mns. Default is '
                             '0.0 seconds (no expansion).')
    parser.add_argument('-see', dest='seg_end_exp', type=float, default=0.0,
                        help='Specifies a segment end expansion time, that '
                             'is, to add some time after each detected '
                             'speaker segment. No overlapping check, so set '
                             'this to less than half the value of -mns. '
                             'Default is 0.0 seconds (no expansion).')
    args = parser.parse_args()
    # Every time is computed by dividing by the rate, so reject values
    # that would divide by zero or produce negative times
    if args.rate <= 0:
        parser.error('the sample rate (-r) must be a positive integer')

    # Process arguments.  Single-argument print(...) calls produce the
    # same output on Python 2 and Python 3.
    print('Reading recipe from: %s' % args.recfile)
    with open(args.recfile, 'r') as recfile:
        recipe = parse_recipe(recfile)

    print('Reading .exp files from: %s' % args.exppath)
    # Appends the trailing separator when it is missing; an empty path
    # stays empty and is rejected by the isdir() check below
    exppath = os.path.join(args.exppath, '')
    if not os.path.isdir(exppath):
        # Message goes to stderr and the exit status is 1
        sys.exit('Error, %s is not a valid directory' % exppath)

    # args.outfile is either a file name or the sys.stdout default object
    outfile = args.outfile
    if outfile is not sys.stdout:
        print('Writing output to: %s' % outfile)
    else:
        print('Writing output to: stdout')

    # The following names are module-level on purpose: parse_exp_file
    # reads rate, ms, mns, sbe and see as globals.
    print('Sample rate set to: %s' % args.rate)
    rate = float(args.rate)  # To ensure floating point division with it
    ms = args.minspeech
    print('Minimum speech turn duration: %s seconds' % ms)
    mns = args.minnonspeech
    print('Minimum nonspeech between-turns duration: %s seconds' % mns)
    sbe = args.seg_before_exp
    print('Segment before expansion set to: %s seconds' % sbe)
    see = args.seg_end_exp
    print('Segment end expansion set to: %s seconds' % see)

    # Do the real work
    if outfile is not sys.stdout:
        with open(outfile, 'w') as outf:
            write_recipe(recipe, exppath, outf)
    else:
        write_recipe(recipe, exppath, outfile)