From 12ed509e991c013022a1da705b966d61d558c7c5 Mon Sep 17 00:00:00 2001 From: Nick John Date: Sat, 9 Nov 2019 14:07:20 +0800 Subject: [PATCH] Update README.md and hparams.py --- README.md | 61 ++++++++++++++++++++++++++++++++++-------------------- hparams.py | 14 ++++++------- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index f6916b86..0faf5d8d 100644 --- a/README.md +++ b/README.md @@ -1,34 +1,41 @@ -# Tacotron-2-Chinese +# **Tacotron-2-Chinese 中文语音合成** -## 预训练模型 +## **预训练模型下载** -[标贝数据集100K步模型](https://github.com/JasonWei512/Tacotron-2-Chinese/releases/download/Biaobei_Tacotron-100K/logs-Tacotron-2.zip) +    [标贝数据集100K步模型(把解压出的 logs-Tacotron-2 文件夹放到 Tacotron-2-Chinese 文件夹中)](https://github.com/JasonWei512/Tacotron-2-Chinese/releases/download/Biaobei_Tacotron-100K/logs-Tacotron-2.zip) -[生成语音样本](https://github.com/JasonWei512/Tacotron-2-Chinese/releases/download/Biaobei_Tacotron-100K/generated_sample.wav) +    [生成语音样本](https://github.com/JasonWei512/Tacotron-2-Chinese/releases/download/Biaobei_Tacotron-100K/generated_sample.wav) -仅Tacotron,无WaveNet(正在尝试 mulaw-quantize) +    仅 Tacotron 频谱预测部分,无 WaveNet 声码器(实验中),可用 Griffin-Lim 合成语音(见下)。 -使用标贝数据集,为避免爆显存用了ffmpeg把语料的采样率从48KHz降到了36KHz +    使用标贝数据集训练,为避免爆显存用了 ffmpeg 把语料的采样率从 48KHz 降到了 36KHz,听感基本无区别。 -## 安装依赖 +## **安装依赖** -1. 安装 Python 3 和 Tensorflow(在 Tensorflow 1.14 上用 WaveNet 会有Bug,在 1.10 上正常) +1. 安装 Python 3 和 Tensorflow 1.10(在 Tensorflow 1.14 上用 WaveNet 会有Bug,在 1.10 上正常)。 2. 安装依赖: - ``` + + ```Shell apt-get install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg libav-tools ``` -3. 安装 requirements: + 若 libav-tools 安装失败则手动安装: + + ```Shell + wget http://launchpadlibrarian.net/339874908/libav-tools_3.3.4-2_all.deb + dpkg -i libav-tools_3.3.4-2_all.deb ``` + +3. 安装 requirements: + + ```Shell pip install -r requirements.txt ``` -## 训练 +## **训练模型** -1. 
**下载[标贝数据集](https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar),解压至 `Tacotron-2-Chinese`** - - 目录结构如下: +1. 下载 [标贝数据集](https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar),解压至 `Tacotron-2-Chinese` 文件夹根目录。目录结构如下: ``` Tacotron-2-Chinese @@ -38,27 +45,35 @@ |- Wave ``` -2. **用ffmpeg把BZNSYP/Wave中的wav的采样率降到36KHz** - ``` +2. 用 ffmpeg 把 `/BZNSYP/Wave/` 中的 wav 的采样率降到 36KHz: + + ```Shell ffmpeg.exe -i 输入.wav -ar 36000 输出.wav ``` -3. **预处理数据** - ``` +3. 预处理数据: + + ```Shell python preprocess.py --dataset='Biaobei' ``` -4. **训练模型(自动从最新 Checkpoint 继续)** - ``` +4. 训练模型(自动从最新 Checkpoint 继续): + + ```Shell python train.py --model='Tacotron-2' ``` -5. **从最新 Checkpoint 合成语音** +## **合成语音** - ``` +* 用根目录的 `sentences.txt` 中的文本合成语音。 + + ```Shell python synthesize.py --model='Tacotron-2' --text_list='sentences.txt' ``` - 无WaveNet时,Tacotron输出mel谱,后处理得线性谱,由Griffin-Lim生成波形 + + 若无 WaveNet 模型,仅有频谱预测模型,则仅由 Griffin-Lim 生成语音,输出至 `/tacotron_output/logs-eval/wavs/` 文件夹中。 + + 若有 WaveNet 模型,则 WaveNet 生成的语音位于 `/wavenet_output/wavs/` 中。   diff --git a/hparams.py b/hparams.py index 9871fbff..bb187781 100644 --- a/hparams.py +++ b/hparams.py @@ -199,12 +199,12 @@ #model parameters #To use Gaussian distribution as output distribution instead of mixture of logistics, set "out_channels = 2" instead of "out_channels = 10 * 3". (UNDER TEST) - out_channels = 2, #This should be equal to quantize channels when input type is 'mulaw-quantize' else: num_distributions * 3 (prob, mean, log_scale). - layers = 20, #Number of dilated convolutions (Default: Simplified Wavenet of Tacotron-2 paper) - stacks = 2, #Number of dilated convolution stacks (Default: Simplified Wavenet of Tacotron-2 paper) - residual_channels = 128, #Number of residual block input/output channels. - gate_channels = 256, #split in 2 in gated convolutions - skip_out_channels = 128, #Number of residual block skip convolution channels. 
+ out_channels = 30, #This should be equal to quantize channels when input type is 'mulaw-quantize' else: num_distributions * 3 (prob, mean, log_scale). + layers = 24, #Number of dilated convolutions (Default: Simplified Wavenet of Tacotron-2 paper) + stacks = 4, #Number of dilated convolution stacks (Default: Simplified Wavenet of Tacotron-2 paper) + residual_channels = 256, #Number of residual block input/output channels. + gate_channels = 512, #split in 2 in gated convolutions + skip_out_channels = 256, #Number of residual block skip convolution channels. kernel_size = 3, #The number of inputs to consider in dilated convolutions. #Upsampling parameters (local conditioning) @@ -310,7 +310,7 @@ wavenet_learning_rate = 1e-3, #wavenet initial learning rate wavenet_warmup = float(4000), #Only used with 'noam' scheme. Defines the number of ascending learning rate steps. wavenet_decay_rate = 0.5, #Only used with 'exponential' scheme. Defines the decay rate. - wavenet_decay_steps = 200000, #Only used with 'exponential' scheme. Defines the decay steps. + wavenet_decay_steps = 150000, #Only used with 'exponential' scheme. Defines the decay steps. #Optimization parameters wavenet_adam_beta1 = 0.9, #Adam beta1