Commit

Merge remote-tracking branch 'origin/master'

apls777 committed Sep 20, 2018
2 parents ea8977b + 5b2118d commit e52c53e
Showing 22 changed files with 925 additions and 439 deletions.
30 changes: 24 additions & 6 deletions datasets/audio.py
@@ -17,6 +17,16 @@ def save_wav(wav, path, sr):
 def save_wavenet_wav(wav, path, sr):
 	librosa.output.write_wav(path, wav, sr=sr)

+def preemphasis(wav, k, preemphasize=True):
+	if preemphasize:
+		return signal.lfilter([1, -k], [1], wav)
+	return wav
+
+def inv_preemphasis(wav, k, inv_preemphasize=True):
+	if inv_preemphasize:
+		return signal.lfilter([1], [1, -k], wav)
+	return wav
+
 #From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
 def start_and_end_indices(quantized, silence_threshold=2):
 	for start in range(quantized.size):
@@ -47,15 +57,15 @@ def get_hop_size(hparams):
 	return hop_size

 def linearspectrogram(wav, hparams):
-	D = _stft(wav, hparams)
+	D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
 	S = _amp_to_db(np.abs(D), hparams) - hparams.ref_level_db

 	if hparams.signal_normalization:
 		return _normalize(S, hparams)
 	return S

 def melspectrogram(wav, hparams):
-	D = _stft(wav, hparams)
+	D = _stft(preemphasis(wav, hparams.preemphasis, hparams.preemphasize), hparams)
 	S = _amp_to_db(_linear_to_mel(np.abs(D), hparams), hparams) - hparams.ref_level_db

 	if hparams.signal_normalization:
@@ -75,9 +85,9 @@ def inv_linear_spectrogram(linear_spectrogram, hparams):
 		processor = _lws_processor(hparams)
 		D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
 		y = processor.istft(D).astype(np.float32)
-		return y
+		return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
 	else:
-		return _griffin_lim(S ** hparams.power, hparams)
+		return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)


 def inv_mel_spectrogram(mel_spectrogram, hparams):
@@ -93,9 +103,9 @@ def inv_mel_spectrogram(mel_spectrogram, hparams):
 		processor = _lws_processor(hparams)
 		D = processor.run_lws(S.astype(np.float64).T ** hparams.power)
 		y = processor.istft(D).astype(np.float32)
-		return y
+		return inv_preemphasis(y, hparams.preemphasis, hparams.preemphasize)
 	else:
-		return _griffin_lim(S ** hparams.power, hparams)
+		return inv_preemphasis(_griffin_lim(S ** hparams.power, hparams), hparams.preemphasis, hparams.preemphasize)

 def _lws_processor(hparams):
 	import lws
@@ -122,6 +132,8 @@ def _stft(y, hparams):
 def _istft(y, hparams):
 	return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)

+##########################################################
+#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
 def num_frames(length, fsize, fshift):
 	"""Compute number of time frames of spectrogram
 	"""
@@ -141,6 +153,12 @@ def pad_lr(x, fsize, fshift):
 	T = len(x) + 2 * pad
 	r = (M - 1) * fshift + fsize - T
 	return pad, pad + r
+##########################################################
+#Librosa correct padding
+def librosa_pad_lr(x, fsize, fshift):
+	'''compute right padding (final frame)
+	'''
+	return int(fsize // 2)


 # Conversions
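A quick sanity check on the new preemphasis/inv_preemphasis pair (a minimal sketch, not part of the commit): the FIR filter [1, -k] and the IIR filter 1/[1, -k] cancel exactly, so applying inv_preemphasis after spectrogram inversion restores the original signal. The value k = 0.97 below is only an illustrative choice for hparams.preemphasis.

import numpy as np
from scipy import signal

k = 0.97  # illustrative value for hparams.preemphasis (an assumption, not read from the repo)
wav = np.random.randn(16000).astype(np.float32)

emphasized = signal.lfilter([1, -k], [1], wav)        # what preemphasis(wav, k) computes
recovered = signal.lfilter([1], [1, -k], emphasized)  # what inv_preemphasis(emphasized, k) computes

print(np.allclose(recovered, wav))  # True: the two filters are exact inverses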
18 changes: 13 additions & 5 deletions datasets/preprocessor.py
@@ -116,12 +116,20 @@ def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hpar
 	#sanity check
 	assert linear_frames == mel_frames

-	#Ensure time resolution adjustement between audio and mel-spectrogram
-	fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
-	l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))
+	if hparams.use_lws:
+		#Ensure time resolution adjustement between audio and mel-spectrogram
+		fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
+		l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))
+
+		#Zero pad audio signal
+		out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
+	else:
+		#Ensure time resolution adjustement between audio and mel-spectrogram
+		pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
+
+		#Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
+		out = np.pad(out, pad, mode='reflect')

-	#Zero pad for quantized signal
-	out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
 	assert len(out) >= mel_frames * audio.get_hop_size(hparams)

 	#time resolution adjustement
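In the non-lws branch above, audio.librosa_pad_lr returns n_fft // 2 and the audio is reflect-padded by that amount on both sides, mirroring what a centered STFT does internally. A standalone sketch of the frame arithmetic (assuming librosa.stft's default center=True framing, i.e. mel_frames == 1 + len(wav) // hop_size; the n_fft/hop values are illustrative, not the repo's hparams):

import numpy as np

n_fft, hop_size = 2048, 275                      # illustrative values only
wav = np.random.randn(40000).astype(np.float32)

mel_frames = 1 + len(wav) // hop_size            # frame count of a centered STFT
pad = n_fft // 2                                 # what librosa_pad_lr(wav, n_fft, hop_size) returns
out = np.pad(wav, pad, mode='reflect')           # reflect pad on both sides, like librosa

assert len(out) >= mel_frames * hop_size         # same sanity check as in _process_utterance
out = out[:mel_frames * hop_size]                # the later "time resolution adjustement" step
print(len(out) // hop_size == mel_frames)        # True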
18 changes: 13 additions & 5 deletions datasets/wavenet_preprocessor.py
@@ -103,12 +103,20 @@ def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):
 	if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
 		return None

-	#Ensure time resolution adjustement between audio and mel-spectrogram
-	fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
-	l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))
+	if hparams.use_lws:
+		#Ensure time resolution adjustement between audio and mel-spectrogram
+		fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
+		l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))
+
+		#Zero pad audio signal
+		out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
+	else:
+		#Ensure time resolution adjustement between audio and mel-spectrogram
+		pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))
+
+		#Reflect pad audio signal (Just like it's done in Librosa to avoid frame inconsistency)
+		out = np.pad(out, pad, mode='reflect')

-	#Zero pad for quantized signal
-	out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
 	assert len(out) >= mel_frames * audio.get_hop_size(hparams)

 	#time resolution adjustement
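This is the same change as in datasets/preprocessor.py. For reference, a tiny illustration (not from the commit) of the two np.pad modes used in the branch above: 'reflect' mirrors the edge samples, matching librosa's centered framing, while 'constant' extends the signal with a fixed value such as the constant_values silence code used on the lws path.

import numpy as np

x = np.array([1, 2, 3, 4], dtype=np.float32)
print(np.pad(x, 2, mode='reflect'))                            # [3. 2. 1. 2. 3. 4. 3. 2.]
print(np.pad(x, (2, 2), mode='constant', constant_values=0.))  # [0. 0. 1. 2. 3. 4. 0. 0.]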
104 changes: 98 additions & 6 deletions griffin_lim_synthesis_tool.ipynb
@@ -2,11 +2,30 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 1,
 "metadata": {
 "scrolled": true
 },
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/home/rayhane/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
+" from ._conv import register_converters as _register_converters\n"
+]
+},
+{
+"data": {
+"text/plain": [
+"(636, 80)"
+]
+},
+"execution_count": 1,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "import numpy as np\n",
 "from datasets.audio import *\n",
@@ -20,22 +39,95 @@
 "\n",
 "os.makedirs(out_dir, exist_ok=True)\n",
 "\n",
-"mel_file = os.path.join(mel_folder, mel_file)\n",
+"#mel_file = os.path.join(mel_folder, mel_file)\n",
+"mel_file = 'training_data/mels/mel-LJ001-0005.npy'\n",
 "mel_spectro = np.load(mel_file)\n",
 "mel_spectro.shape"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 2,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/home/rayhane/.local/lib/python3.6/site-packages/librosa/util/utils.py:1725: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
+" if np.issubdtype(x.dtype, float) or np.issubdtype(x.dtype, complex):\n"
+]
+}
+],
 "source": [
 "wav = inv_mel_spectrogram(mel_spectro.T, hparams) \n",
 "#save the wav under test_<folder>_<file>\n",
-"save_wav(wav, os.path.join(out_dir, 'test_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_'))),\n",
+"save_wav(wav, os.path.join(out_dir, 'test_mel_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n",
 " sr=hparams.sample_rate)"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": 3,
+"metadata": {},
+"outputs": [],
+"source": [
+"from tacotron.utils.plot import *\n",
+"\n",
+"plot_spectrogram(mel_spectro, path=os.path.join(out_dir, 'test_mel_{}.png'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))))"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 4,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"(636, 1025)"
+]
+},
+"execution_count": 4,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"lin_file = 'training_data/linear/linear-LJ001-0005.npy'\n",
+"lin_spectro = np.load(lin_file)\n",
+"lin_spectro.shape"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 5,
+"metadata": {},
+"outputs": [
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"/home/rayhane/.local/lib/python3.6/site-packages/librosa/util/utils.py:1725: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
+" if np.issubdtype(x.dtype, float) or np.issubdtype(x.dtype, complex):\n"
+]
+}
+],
+"source": [
+"wav = inv_linear_spectrogram(lin_spectro.T, hparams)\n",
+"save_wav(wav, os.path.join(out_dir, 'test_linear_{}.wav'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n",
+" sr=hparams.sample_rate)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 6,
+"metadata": {},
+"outputs": [],
+"source": [
+"plot_spectrogram(lin_spectro, path=os.path.join(out_dir, 'test_linear_{}.png'.format(mel_file.replace('/', '_').replace('\\\\', '_').replace('.npy', ''))),\n",
+" auto_aspect=True)"
+]
+}
 ],
 "metadata": {
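The notebook edits above amount to the following script (a sketch under assumptions: the `from hparams import hparams` import is assumed since the notebook's visible cells only show `from datasets.audio import *`; the training_data/ paths and array shapes come from the notebook output): load a saved mel and linear spectrogram, invert both with Griffin-Lim, and write the wavs. With hparams.preemphasize enabled, both inversion functions now also undo the pre-emphasis added in datasets/audio.py.

import os
import numpy as np
from datasets.audio import inv_mel_spectrogram, inv_linear_spectrogram, save_wav
from hparams import hparams  # assumed import; not shown in the notebook's visible cells

out_dir = 'wav_out'
os.makedirs(out_dir, exist_ok=True)

mel = np.load('training_data/mels/mel-LJ001-0005.npy')       # (frames, num_mels), e.g. (636, 80)
lin = np.load('training_data/linear/linear-LJ001-0005.npy')  # (frames, 1 + n_fft // 2), e.g. (636, 1025)

# inv_*_spectrogram expect (bins, frames), hence the transposes
save_wav(inv_mel_spectrogram(mel.T, hparams),
         os.path.join(out_dir, 'test_mel_LJ001-0005.wav'), sr=hparams.sample_rate)
save_wav(inv_linear_spectrogram(lin.T, hparams),
         os.path.join(out_dir, 'test_linear_LJ001-0005.wav'), sr=hparams.sample_rate)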
