# CONFIG -----------------------------------------------------------------------------------------------------------#
# Here are the input and output data paths (Note: you can override wav_path in preprocess.py)
wav_path = '/path/to/wav_files/'
data_path = 'data/'
# model ids are separate - that way you can use a new tts with an old wavernn and vice versa
# NB: expect undefined behaviour if models were trained on different DSP settings
voc_model_id = 'ljspeech_raw'
tts_model_id = 'ljspeech_tts'
# set this to True if you are only interested in WaveRNN
ignore_tts = False
# DSP --------------------------------------------------------------------------------------------------------------#
# Settings for all models
sample_rate = 22050
n_fft = 1024
fft_bins = n_fft // 2 + 1
num_mels = 80
hop_length = 256 # ≈11.6ms at 22050 Hz - close to the 12.5ms of the Tacotron 2 paper
win_length = 1024 # ≈46.4ms - close to the paper's 50ms
fmin = 0
fmax = 8000
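# Illustrative sketch: how the settings above map onto a librosa mel-spectrogram
# call; the repo's own DSP code may differ in detail (e.g. normalisation).
def _example_melspectrogram(wav):
    import librosa  # imported lazily so this module stays importable without librosa
    return librosa.feature.melspectrogram(
        y=wav, sr=sample_rate, n_fft=n_fft, hop_length=hop_length,
        win_length=win_length, n_mels=num_mels, fmin=fmin, fmax=fmax)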
bits = 9 # bit depth of signal
mu_law = True # Recommended to suppress noise if using raw bits in hp.voc_mode below
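# Illustrative sketch: mu-law companding as applied to 'RAW'-mode targets; with
# mu = 2**bits - 1 the signal is quantised to 2**bits discrete levels.
def _example_mu_compress(wav):
    import librosa
    return librosa.mu_compress(wav, mu=2 ** bits - 1, quantize=True)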
peak_norm = False # Normalise to the peak of each wav file
trim_start_end_silence = True # Whether to trim leading and trailing silence
trim_silence_top_db = 60 # Threshold in decibels below reference to consider as silence when trimming
                         # start and end silences with librosa (no trimming if set very high)
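# Illustrative sketch: trimming leading/trailing silence with librosa.
def _example_trim_silence(wav):
    import librosa
    trimmed, _ = librosa.effects.trim(wav, top_db=trim_silence_top_db)
    return trimmed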
pitch_max_freq = 600 # Maximum pitch frequency in Hz, used to remove outliers (typical speech
                     # pitch lies roughly between 60 and 300 Hz)
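# Illustrative sketch (not the repo's actual preprocessing): pitch tracking with
# librosa.yin, using pitch_max_freq as the upper bound to cap outliers; the lower
# bound of 60 Hz here is an assumption, not a value from this file.
def _example_extract_pitch(wav):
    import librosa
    return librosa.yin(wav, fmin=60, fmax=pitch_max_freq, sr=sample_rate,
                       frame_length=n_fft, hop_length=hop_length)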
# Params for trimming long silences, from https://github.com/resemble-ai/Resemblyzer/blob/master/resemblyzer/hparams.py
trim_long_silences = False # Whether to reduce long silence using WebRTC Voice Activity Detector
vad_window_length = 30 # In milliseconds
vad_moving_average_width = 8
vad_max_silence_length = 12
vad_sample_rate = 16000
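# Illustrative sketch (after the Resemblyzer code linked above): webrtcvad flags
# speech per 30 ms window of 16-bit mono PCM at vad_sample_rate; the boolean
# flags are then smoothed over vad_moving_average_width windows (not shown).
def _example_vad_flags(pcm_bytes):
    import webrtcvad
    vad = webrtcvad.Vad(3)  # aggressiveness 3 = most aggressive filtering
    frame_bytes = 2 * (vad_sample_rate * vad_window_length // 1000)  # 2 bytes per sample
    return [vad.is_speech(pcm_bytes[i:i + frame_bytes], vad_sample_rate)
            for i in range(0, len(pcm_bytes) - frame_bytes + 1, frame_bytes)]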
# GENERAL TRAINING ----------------------------------------------------------------------------------------------------------#
seed = 42
n_val = 200 # number of validation samples
# WAVERNN / VOCODER ------------------------------------------------------------------------------------------------#
# Model Hparams
voc_mode = 'RAW' # either 'RAW' (softmax on raw bits) or 'MOL' (sample from mixture of logistics)
voc_upsample_factors = (4, 8, 8) # NB - this needs to correctly factorise hop_length
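# Illustrative sanity check: the product of the upsample factors (4 * 8 * 8 = 256)
# must equal hop_length so the vocoder output aligns with the mel frames.
_upsample_product = 1
for _factor in voc_upsample_factors:
    _upsample_product *= _factor
assert _upsample_product == hop_length, 'voc_upsample_factors must factorise hop_length'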
voc_rnn_dims = 512
voc_fc_dims = 512
voc_compute_dims = 128
voc_res_out_dims = 128
voc_res_blocks = 10
# Training
voc_schedule = [(1e-4, 300_000, 32), # progressive training schedule
(2e-5, 2_000_000, 32)] # (lr, step, batch_size)
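# Illustrative sketch: how a progressive schedule is typically consumed; the
# first phase whose end step is still ahead of the current step is the active
# one. tts_schedule and forward_schedule below work the same way (tts tuples
# carry an extra leading reduction factor r).
def _example_current_phase(schedule, step):
    for phase in schedule:
        if step < phase[-2]:  # second-to-last entry is the phase's end step
            return phase
    return schedule[-1]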
voc_checkpoint_every = 25_000
voc_gen_samples_every = 5000 # how often to generate samples for cherry-picking models
voc_gen_num_samples = 3 # number of samples to generate for cherry-picking models
voc_keep_top_k = 3 # how many top performing models to keep
voc_pad = 2 # this will pad the input so that the resnet can 'see' wider than input length
voc_seq_len = hop_length * 5 # must be a multiple of hop_length
voc_clip_grad_norm = 4 # set to None if no gradient clipping needed
# Generating / Synthesizing
voc_gen_batched = True # very fast (realtime+) single utterance batched generation
voc_target = 11_000 # target number of samples to be generated in each batch entry
voc_overlap = 550 # number of samples for crossfading between batches
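# Illustrative sketch: batched generation splits a long utterance into segments
# of roughly voc_target samples, synthesises them in parallel, then crossfades
# voc_overlap samples at each seam, e.g.:
def _example_xfade_join(segments, overlap=voc_overlap):
    import numpy as np
    fade_in = np.linspace(0.0, 1.0, overlap)
    out = segments[0]
    for seg in segments[1:]:
        seam = out[-overlap:] * (1.0 - fade_in) + seg[:overlap] * fade_in
        out = np.concatenate([out[:-overlap], seam, seg[overlap:]])
    return out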
# Duration Extraction from Attention
extract_durations_with_dijkstra = True # slower but much more robust than simply counting attention peaks
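# Illustrative sketch: the naive alternative to the Dijkstra extraction counts
# attention peaks; each phoneme's duration is the number of mel frames whose
# attention argmax lands on it. The Dijkstra variant instead finds a monotonic
# path through the attention matrix, which copes far better with noisy rows.
def _example_durations_from_attention(attention):
    import numpy as np
    peaks = attention.argmax(axis=1)  # attention: (mel_frames, phonemes)
    return np.bincount(peaks, minlength=attention.shape[1])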
# TACOTRON TTS -----------------------------------------------------------------------------------------------------#
# Model Hparams
tts_embed_dims = 256 # embedding dimension for the graphemes/phoneme inputs
tts_encoder_dims = 128
tts_decoder_dims = 256
tts_postnet_dims = 128
tts_encoder_K = 16
tts_lstm_dims = 512
tts_postnet_K = 8
tts_num_highways = 4
tts_dropout = 0.5
language = 'en-us'
tts_cleaner_name = 'english_cleaners'
tts_stop_threshold = -11 # Value below which audio generation ends: the decoder
                         # terminates at the first generated frame whose values
                         # all fall below this threshold
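# Illustrative sketch: how the stop threshold is typically applied while
# generating mel frames autoregressively.
def _example_should_stop(mel_frame):
    # terminate once every mel bin of the current frame falls below the threshold
    return bool((mel_frame < tts_stop_threshold).all())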
# Training
tts_schedule = [(10, 1e-3, 10_000, 32), # progressive training schedule
(5, 1e-4, 20_000, 16), # (r, lr, step, batch_size)
(2, 1e-4, 30_000, 8),
(1, 1e-4, 50_000, 8)]
tts_max_mel_len = 1250 # if you have a couple of extremely long spectrograms you might want to use this
tts_clip_grad_norm = 1.0 # clips the gradient norm to prevent explosion - set to None if not needed
tts_checkpoint_every = 10_000 # checkpoints the model every X steps
tts_plot_every = 1000
# ------------------------------------------------------------------------------------------------------------------#
# FORWARD TACOTRON -----------------------------------------------------------------------------------------------------#
# Model Hparams
forward_embed_dims = 256 # embedding dimension for the graphemes/phoneme inputs
forward_prenet_dims = 256
forward_postnet_dims = 256
forward_durpred_conv_dims = 256
forward_durpred_rnn_dims = 64
forward_durpred_dropout = 0.5
forward_pitch_conv_dims = 256
forward_pitch_rnn_dims = 128
forward_pitch_dropout = 0.5
forward_pitch_emb_dims = 64 # embedding dimension of pitch, set to 0 if you don't want pitch conditioning
forward_pitch_proj_dropout = 0.
forward_prenet_K = 16
forward_postnet_K = 8
forward_rnn_dims = 512
forward_num_highways = 4
forward_dropout = 0.1
# Training
forward_schedule = [(1e-4, 10_000, 32),   # progressive training schedule
                    (1e-4, 300_000, 32),  # (lr, step, batch_size)
                    (2e-5, 600_000, 32)]
forward_max_mel_len = 1250 # if you have a couple of extremely long spectrograms you might want to use this
forward_clip_grad_norm = 1.0 # clips the gradient norm to prevent explosion - set to None if not needed
forward_checkpoint_every = 10_000 # checkpoints the model every X steps
forward_plot_every = 1000
forward_filter_attention = True # whether to filter data with bad attention scores
forward_min_attention_sharpness = 0.5 # filter data with bad attention sharpness score, if 0 then no filter
forward_min_attention_alignment = 0.95 # filter data with bad attention alignment score, if 0 then no filter
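# Illustrative sketch: how the thresholds above are typically applied when
# filtering the dataset (a threshold of 0 disables that check).
def _example_keep_item(sharpness, alignment):
    if not forward_filter_attention:
        return True
    ok_sharp = forward_min_attention_sharpness == 0 or sharpness >= forward_min_attention_sharpness
    ok_align = forward_min_attention_alignment == 0 or alignment >= forward_min_attention_alignment
    return ok_sharp and ok_align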
# ------------------------------------------------------------------------------------------------------------------#