-
Notifications
You must be signed in to change notification settings - Fork 6
/
build.py
100 lines (75 loc) · 2.74 KB
/
build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import numpy as np
import pyworld as pw
import soundfile as sf
import tensorflow as tf
from analyzer import SPEAKERS, pw2wav, read, read_whole_features
args = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string(
'train_file_pattern',
'./dataset/vcc2016/bin/Training Set/*/*.bin',
'training dir (to *.bin)')
def main():
tf.gfile.MkDir('./etc')
# ==== Save max and min value ====
x = read_whole_features(args.train_file_pattern)
x_all = list()
y_all = list()
f0_all = list()
sv = tf.train.Supervisor()
with sv.managed_session() as sess:
while True:
try:
features = sess.run(x)
print('Processing {}'.format(features['filename']))
x_all.append(features['sp'])
y_all.append(features['speaker'])
f0_all.append(features['f0'])
finally:
pass
x_all = np.concatenate(x_all, axis=0)
y_all = np.concatenate(y_all, axis=0)
f0_all = np.concatenate(f0_all, axis=0)
# ==== F0 stats ====
for s in SPEAKERS:
print('Speaker {}'.format(s))
f0 = f0_all[SPEAKERS.index(s) == y_all]
print(' len: {}'.format(len(f0)))
f0 = f0[f0 > 2.]
f0 = np.log(f0)
mu, std = f0.mean(), f0.std()
# Save as `float32`
with open('./etc/{}.npf'.format(s), 'wb') as fp:
fp.write(np.asarray([mu, std]).tostring())
# ==== Min/Max value ====
# mu = x_all.mean(0)
# std = x_all.std(0)
q005 = np.percentile(x_all, 0.5, axis=0)
q995 = np.percentile(x_all, 99.5, axis=0)
# Save as `float32`
with open('./etc/xmin.npf', 'wb') as fp:
fp.write(q005.tostring())
with open('./etc/xmax.npf', 'wb') as fp:
fp.write(q995.tostring())
def test():
# ==== Test: batch mixer (conclusion: capacity should be larger to make sure good mixing) ====
x, y = read('./dataset/vcc2016/bin/*/*/1*001.bin', 32, min_after_dequeue=1024, capacity=2048)
sv = tf.train.Supervisor()
with sv.managed_session() as sess:
for _ in range(200):
x_, y_ = sess.run([x, y])
print(y_)
# ===== Read binary ====
features = read_whole_features('./dataset/vcc2016/bin/Training Set/SF1/*001.bin')
sv = tf.train.Supervisor()
with sv.managed_session() as sess:
features = sess.run(features)
y = pw2wav(features)
sf.write('test1.wav', y, 16000) # TODO fs should be specified externally.
# ==== Direct read =====
f = './dataset/vcc2016/bin/Training Set/SF1/100001.bin'
features = np.fromfile(f, np.float32)
features = np.reshape(features, [-1, 513*2 + 1 + 1 + 1]) # f0, en, spk
y = pw2wav(features)
sf.write('test2.wav', y, 16000)
if __name__ == '__main__':
main()