From 12ed509e991c013022a1da705b966d61d558c7c5 Mon Sep 17 00:00:00 2001
From: Nick John <nickjohn1995@live.com>
Date: Sat, 9 Nov 2019 14:07:20 +0800
Subject: [PATCH] Update README.md and hparams.py

---
 README.md  | 61 ++++++++++++++++++++++++++++++++++--------------------
 hparams.py | 14 ++++++-------
 2 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index f6916b86..0faf5d8d 100644
--- a/README.md
+++ b/README.md
@@ -1,34 +1,41 @@
-# Tacotron-2-Chinese
+# **Tacotron-2-Chinese 中文语音合成**
 
-## 预训练模型
+## **预训练模型下载**
 
-[标贝数据集100K步模型](https://github.com/JasonWei512/Tacotron-2-Chinese/releases/download/Biaobei_Tacotron-100K/logs-Tacotron-2.zip)
+&ensp; &ensp; [标贝数据集100K步模型（把解压出的 logs-Tacotron-2 文件夹放到 Tacotron-2-Chinese 文件夹中）](https://github.com/JasonWei512/Tacotron-2-Chinese/releases/download/Biaobei_Tacotron-100K/logs-Tacotron-2.zip)
 
-[生成语音样本](https://github.com/JasonWei512/Tacotron-2-Chinese/releases/download/Biaobei_Tacotron-100K/generated_sample.wav)
+&ensp; &ensp; [生成语音样本](https://github.com/JasonWei512/Tacotron-2-Chinese/releases/download/Biaobei_Tacotron-100K/generated_sample.wav)
 
-仅Tacotron，无WaveNet（正在尝试 mulaw-quantize）
+&ensp; &ensp; 仅 Tacotron 频谱预测部分，无 WaveNet 声码器（实验中），可用 Griffin-Lim 合成语音（见下）。
 
-使用标贝数据集，为避免爆显存用了ffmpeg把语料的采样率从48KHz降到了36KHz
+&ensp; &ensp; 使用标贝数据集训练，为避免爆显存用了 ffmpeg 把语料的采样率从 48 kHz 降到了 36 kHz，听感基本无区别。
 
-## 安装依赖
+## **安装依赖**
 
-1. 安装 Python 3 和 Tensorflow（在 Tensorflow 1.14 上用 WaveNet 会有Bug，在 1.10 上正常）
+1. 安装 Python 3 和 TensorFlow 1.10（在 TensorFlow 1.14 上用 WaveNet 会有 Bug，在 1.10 上正常）。
 
 2. 安装依赖：
-   ```
+   
+   ```Shell
    apt-get install -y libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg libav-tools
    ```
 
-3. 安装 requirements：
+   若 libav-tools 安装失败则手动安装：
+
+   ```Shell
+   wget http://launchpadlibrarian.net/339874908/libav-tools_3.3.4-2_all.deb
+   dpkg -i libav-tools_3.3.4-2_all.deb
    ```
+
+3. 安装 requirements：
+   
+   ```Shell
    pip install -r requirements.txt
    ```
 
-## 训练
+## **训练模型**
 
-1. **下载[标贝数据集](https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar)，解压至 `Tacotron-2-Chinese`**
-   
-   目录结构如下：
+1. 下载 [标贝数据集](https://weixinxcxdb.oss-cn-beijing.aliyuncs.com/gwYinPinKu/BZNSYP.rar)，解压至 `Tacotron-2-Chinese` 文件夹根目录。目录结构如下：
 
    ```
    Tacotron-2-Chinese
@@ -38,27 +45,35 @@
          |- Wave
    ```
 
-2. **用ffmpeg把BZNSYP/Wave中的wav的采样率降到36KHz**
-   ```
+2. 用 ffmpeg 把 `/BZNSYP/Wave/` 中的 wav 的采样率降到 36 kHz：
+   
+   ```Shell
    ffmpeg.exe -i 输入.wav -ar 36000 输出.wav
    ```
 
-3. **预处理数据**
-   ```
+3. 预处理数据：
+   
+   ```Shell
    python preprocess.py --dataset='Biaobei'
    ```
 
-4. **训练模型（自动从最新 Checkpoint 继续）**
-   ```
+4. 训练模型（自动从最新 Checkpoint 继续）：
+   
+   ```Shell
    python train.py --model='Tacotron-2'
    ```
 
-5. **从最新 Checkpoint 合成语音** 
+## **合成语音**
 
-   ```
+* 用根目录的 `sentences.txt` 中的文本合成语音：
+
+   ```Shell
    python synthesize.py --model='Tacotron-2' --text_list='sentences.txt'
    ```
-   无WaveNet时，Tacotron输出mel谱，后处理得线性谱，由Griffin-Lim生成波形
+
+   若无 WaveNet 模型，仅有频谱预测模型，则仅由 Griffin-Lim 生成语音，输出至 `/tacotron_output/logs-eval/wavs/` 文件夹中。
+
+   若有 WaveNet 模型，则 WaveNet 生成的语音位于 `/wavenet_output/wavs/` 中。
 
 &nbsp;
 
diff --git a/hparams.py b/hparams.py
index 9871fbff..bb187781 100644
--- a/hparams.py
+++ b/hparams.py
@@ -199,12 +199,12 @@
 
 	#model parameters
 	#To use Gaussian distribution as output distribution instead of mixture of logistics, set "out_channels = 2" instead of "out_channels = 10 * 3". (UNDER TEST)
-	out_channels = 2, #This should be equal to quantize channels when input type is 'mulaw-quantize' else: num_distributions * 3 (prob, mean, log_scale).
-	layers = 20, #Number of dilated convolutions (Default: Simplified Wavenet of Tacotron-2 paper)
-	stacks = 2, #Number of dilated convolution stacks (Default: Simplified Wavenet of Tacotron-2 paper)
-	residual_channels = 128, #Number of residual block input/output channels.
-	gate_channels = 256, #split in 2 in gated convolutions
-	skip_out_channels = 128, #Number of residual block skip convolution channels.
+	out_channels = 30, #This should be equal to quantize channels when input type is 'mulaw-quantize' else: num_distributions * 3 (prob, mean, log_scale).
+	layers = 24, #Number of dilated convolutions (Default: Simplified Wavenet of Tacotron-2 paper)
+	stacks = 4, #Number of dilated convolution stacks (Default: Simplified Wavenet of Tacotron-2 paper)
+	residual_channels = 256, #Number of residual block input/output channels.
+	gate_channels = 512, #split in 2 in gated convolutions
+	skip_out_channels = 256, #Number of residual block skip convolution channels.
 	kernel_size = 3, #The number of inputs to consider in dilated convolutions.
 
 	#Upsampling parameters (local conditioning)
@@ -310,7 +310,7 @@
 	wavenet_learning_rate = 1e-3, #wavenet initial learning rate
 	wavenet_warmup = float(4000), #Only used with 'noam' scheme. Defines the number of ascending learning rate steps.
 	wavenet_decay_rate = 0.5, #Only used with 'exponential' scheme. Defines the decay rate.
-	wavenet_decay_steps = 200000, #Only used with 'exponential' scheme. Defines the decay steps.
+	wavenet_decay_steps = 150000, #Only used with 'exponential' scheme. Defines the decay steps.
 
 	#Optimization parameters
 	wavenet_adam_beta1 = 0.9, #Adam beta1