Commit

Merge remote-tracking branch 'origin/master'

apls777 committed Sep 20, 2018
2 parents ea8977b + 5b2118d commit e52c53e
Showing 22 changed files with 925 additions and 439 deletions.
30 changes: 24 additions & 6 deletions datasets/audio.py
@@ -17,6 +17,16 @@ def save_wav(wav, path, sr):
 def save_wavenet_wav(wav, path, sr):
 	librosa.output.write_wav(path, wav, sr=sr)

+def preemphasis(wav, k, preemphasize=True):
+	if preemphasize:
+		return signal.lfilter([1, -k], [1], wav)
+	return wav
+
+def inv_preemphasis(wav, k, inv_preemphasize=True):
+	if inv_preemphasize:
+		return signal.lfilter([1], [1, -k], wav)
+	return wav
+
 #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
 def start_and_end_indices(quantized, silence_threshold=2):
 	for start in range(quantized.size):
@@ -47,15 +57,15 @@ def get_hop_size(hparams):
 	return hop_size

 def linearspectrogram(wav, hparams):
-	D = _stft(wav, hparams)
+	D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
 	S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db

 	if hparams.signal_normalization:
 		return _normalize(S, hparams)
 	return S

 def melspectrogram(wav, hparams):
-	D = _stft(wav, hparams)
+	D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
 	S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db

 	if hparams.signal_normalization:
@@ -75,9 +85,9 @@ def inv_linear_spectrogram(linear_spectrogram, hparams):
 		processor = _lws_processor(hparams)
 		D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
 		y = processor.istft(D).astype(np.float32)
-		return y
+		return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
 	else:
-		return _griffin_lim(S ** hparams.power, hparams)
+		return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)


 def inv_mel_spectrogram(mel_spectrogram, hparams):
@@ -93,9 +103,9 @@ def inv_mel_spectrogram(mel_spectrogram, hparams):
 		processor = _lws_processor(hparams)
 		D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
 		y = processor.istft(D).astype(np.float32)
-		return y
+		return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
 	else:
-		return _griffin_lim(S ** hparams.power, hparams)
+		return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)

 def _lws_processor(hparams):
 	import lws
@@ -122,6 +132,8 @@ def _stft(y, hparams):
 def _istft(y, hparams):
 	return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)

+##########################################################
+#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
 def num_frames(length, fsize, fshift):
 	"""Compute number of time frames of spectrogram
 	"""
@@ -141,6 +153,12 @@ def pad_lr(x, fsize, fshift):
 	T = len(x) + 2 * pad
 	r = (M - 1) * fshift + fsize - T
 	return pad, pad + r
+##########################################################
+#Librosa correct padding
+def librosa_pad_lr(x, fsize, fshift):
+	'''compute right padding (final frame)
+	'''
+	return int(fsize // 2)


 # Conversions
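A quick sanity check on the new preemphasis/inv_preemphasis pair (a minimal sketch, not part of the commit): the FIR filter [1, -k] and the IIR filter 1/[1, -k] cancel exactly, so applying inv_preemphasis after spectrogram inversion restores the original signal. The value k = 0.97 below is only an illustrative choice for hparams.preemphasis.

import numpy as np
from scipy import signal

k = 0.97  # illustrative value for hparams.preemphasis (an assumption, not read from the repo)
wav = np.random.randn(16000).astype(np.float32)

emphasized = signal.lfilter([1, -k], [1], wav)        # what preemphasis(wav, k) computes
recovered = signal.lfilter([1], [1, -k], emphasized)  # what inv_preemphasis(emphasized, k) computes

print(np.allclose(recovered, wav))  # True: the two filters are exact inverses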
18 changes: 13 additions & 5 deletions datasets/preprocessor.py
@@ -116,12 +116,20 @@ def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hpar
 	#sanity check
 	assert linear_frames == mel_frames

-	#Ensure time resolution adjustement between audio and mel-spectrogram
-	fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
-	l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))
+	if hparams.use_lws:
+		#Ensure time resolution adjustement between audio and mel-spectrogram
+		fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
+		l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))
+
+		#Zero pad audio signal
+		out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
+	else:
+		#Ensure time resolution adjustement between audio and mel-spectrogram
+		pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
+
+		#Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
+		out = np.pad(out, pad, mode='reflect')

-	#Zero pad for quantized signal
-	out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
 	assert len(out) >= mel_frames * audio.get_hop_size(hparams)

 	#time resolution adjustement
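In the non-lws branch above, audio.librosa_pad_lr returns n_fft // 2 and the audio is reflect-padded by that amount on both sides, mirroring what a centered STFT does internally. A standalone sketch of the frame arithmetic (assuming librosa.stft's default center=True framing, i.e. mel_frames == 1 + len(wav) // hop_size; the n_fft/hop values are illustrative, not the repo's hparams):

import numpy as np

n_fft, hop_size = 2048, 275                      # illustrative values only
wav = np.random.randn(40000).astype(np.float32)

mel_frames = 1 + len(wav) // hop_size            # frame count of a centered STFT
pad = n_fft // 2                                 # what librosa_pad_lr(wav, n_fft, hop_size) returns
out = np.pad(wav, pad, mode='reflect')           # reflect pad on both sides, like librosa

assert len(out) >= mel_frames * hop_size         # same sanity check as in _process_utterance
out = out[:mel_frames * hop_size]                # the later "time resolution adjustement" step
print(len(out) // hop_size == mel_frames)        # True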
18 changes: 13 additions & 5 deletions datasets/wavenet_preprocessor.py
@@ -103,12 +103,20 @@ def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):
 	if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
 		return None

-	#Ensure time resolution adjustement between audio and mel-spectrogram
-	fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
-	l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))
+	if hparams.use_lws:
+		#Ensure time resolution adjustement between audio and mel-spectrogram
+		fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
+		l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))
+
+		#Zero pad audio signal
+		out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
+	else:
+		#Ensure time resolution adjustement between audio and mel-spectrogram
+		pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
+
+		#Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
+		out = np.pad(out, pad, mode='reflect')

-	#Zero pad for quantized signal
-	out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
 	assert len(out) >= mel_frames * audio.get_hop_size(hparams)

 	#time resolution adjustement
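This is the same change as in datasets/preprocessor.py. For reference, a tiny illustration (not from the commit) of the two np.pad modes used in the branch above: 'reflect' mirrors the edge samples, matching librosa's centered framing, while 'constant' extends the signal with a fixed value such as the constant_values silence code used on the lws path.

import numpy as np

x = np.array([1, 2, 3, 4], dtype=np.float32)
print(np.pad(x, 2, mode='reflect'))                            # [3. 2. 1. 2. 3. 4. 3. 2.]
print(np.pad(x, (2, 2), mode='constant', constant_values=0.))  # [0. 0. 1. 2. 3. 4. 0. 0.]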
104 changes: 98 additions & 6 deletions griffin_lim_synthesis_tool.ipynb
@@ -2,11 +2,30 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 1,
 "metadata": {
 "scrolled": true
 },
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/home/rayhane/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
+" from ._conv import register_converters as _register_converters\n"
+]
+},
+{
+"data": {
+"text/plain": [
+"(636, 80)"
+]
+},
+"execution_count": 1,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "import numpy as np\n",
 "from datasets.audio import *\n",
@@ -20,22 +39,95 @@
 "\n",
 "os.makedirs(out_dir, exist_ok=True)\n",
 "\n",
-"mel_file = os.path.join(mel_folder, mel_file)\n",
+"#mel_file = os.path.join(mel_folder, mel_file)\n",
+"mel_file = 'training_data/mels/mel-LJ001-0005.npy'\n",
 "mel_spectro = np.load(mel_file)\n",
 "mel_spectro.shape"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 2,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/home/rayhane/.local/lib/python3.6/site-packages/librosa/util/utils.py:1725: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
+" if np.issubdtype(x.dtype, float) or np.issubdtype(x.dtype, complex):\n"
+]
+}
+],
 "source": [
 "wav = inv_mel_spectrogram(mel_spectro.T, hparams) \n",
 "#save the wav under test_<folder>_<file>\n",
-"save_wav(wav, os.path.join(out_dir, 'test_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_'))),\n",
+"save_wav(wav, os.path.join(out_dir, 'test_mel_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n",
 " sr=hparams.sample_rate)"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": 3,
+"metadata": {},
+"outputs": [],
+"source": [
+"from tacotron.utils.plot import *\n",
+"\n",
+"plot_spectrogram(mel_spectro, path=os.path.join(out_dir, 'test_mel_{}.png'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 4,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"(636, 1025)"
+]
+},
+"execution_count": 4,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"lin_file = 'training_data/linear/linear-LJ001-0005.npy'\n",
+"lin_spectro = np.load(lin_file)\n",
+"lin_spectro.shape"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/home/rayhane/.local/lib/python3.6/site-packages/librosa/util/utils.py:1725: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
+" if np.issubdtype(x.dtype, float) or np.issubdtype(x.dtype, complex):\n"
+]
+}
+],
+"source": [
+"wav = inv_linear_spectrogram(lin_spectro.T, hparams)\n",
+"save_wav(wav, os.path.join(out_dir, 'test_linear_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n",
+" sr=hparams.sample_rate)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 6,
+"metadata": {},
+"outputs": [],
+"source": [
+"plot_spectrogram(lin_spectro, path=os.path.join(out_dir, 'test_linear_{}.png'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n",
+" auto_aspect=True)"
+]
+}
 ],
 "metadata": {
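The notebook edits above amount to the following script (a sketch under assumptions: the `from hparams import hparams` import is assumed since the notebook's visible cells only show `from datasets.audio import *`; the training_data/ paths and array shapes come from the notebook output): load a saved mel and linear spectrogram, invert both with Griffin-Lim, and write the wavs. With hparams.preemphasize enabled, both inversion functions now also undo the pre-emphasis added in datasets/audio.py.

import os
import numpy as np
from datasets.audio import inv_mel_spectrogram, inv_linear_spectrogram, save_wav
from hparams import hparams  # assumed import; not shown in the notebook's visible cells

out_dir = 'wav_out'
os.makedirs(out_dir, exist_ok=True)

mel = np.load('training_data/mels/mel-LJ001-0005.npy')       # (frames, num_mels), e.g. (636, 80)
lin = np.load('training_data/linear/linear-LJ001-0005.npy')  # (frames, 1 + n_fft // 2), e.g. (636, 1025)

# inv_*_spectrogram expect (bins, frames), hence the transposes
save_wav(inv_mel_spectrogram(mel.T, hparams),
         os.path.join(out_dir, 'test_mel_LJ001-0005.wav'), sr=hparams.sample_rate)
save_wav(inv_linear_spectrogram(lin.T, hparams),
         os.path.join(out_dir, 'test_linear_LJ001-0005.wav'), sr=hparams.sample_rate)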
