Commit cf32417 (initial commit, May 18, 2020)
Showing 27 changed files with 1,173 additions and 0 deletions.
@@ -0,0 +1,12 @@
*
!examples
!models
!*.py
!.gitignore
!*.md
!requirements.txt
!docs/
!images
!*.svg
!*.png
!LICENSE*
@@ -0,0 +1,27 @@
# MASR Chinese Speech Recognition

**MASR** is a **Mandarin Chinese speech recognition** project based on an **end-to-end deep neural network**.

## How It Works

MASR uses a gated convolutional network, with an architecture similar to Wav2letter, which Facebook proposed in 2016. The difference is that the activation function is not `ReLU` or `HardTanh` but `GLU` (gated linear unit), hence the name. In my experiments, `GLU` converges faster than `HardTanh`. If you want to study how convolutional networks perform on speech recognition, this project can serve as a reference.
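To make the idea concrete, here is a minimal sketch of one gated convolutional layer in PyTorch. It is not the exact `GatedConv` defined in `models/conv.py`; the class name, channel counts, and kernel size are illustrative only.

```python
import torch.nn as nn
import torch.nn.functional as F

class GLUConvBlock(nn.Module):
    """One gated convolutional layer: the convolution emits twice the
    output channels, and half of them gate the other half."""

    def __init__(self, in_channels, out_channels, kernel_size):
        super().__init__()
        self.conv = nn.Conv1d(
            in_channels, 2 * out_channels, kernel_size, padding=kernel_size // 2
        )

    def forward(self, x):  # x: (batch, channels, time)
        # F.glu splits the channel dimension in half and computes a * sigmoid(b)
        return F.glu(self.conv(x), dim=1)
```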
**Model performance below is measured by character error rate (CER): CER = edit distance / sentence length. Lower is better.**

**Roughly speaking, 1 - CER is the recognition accuracy.**
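As a concrete example, here is the CER for a one-character substitution, computed with the same `Levenshtein` package this project's decoder uses:

```python
import Levenshtein as Lev

ref = "今天天气很好"  # ground truth, 6 characters
hyp = "今天天汽很好"  # hypothesis with one substituted character

cer = Lev.distance(ref, hyp) / len(ref)
print(cer)      # 0.1667 (1 edit / 6 characters)
print(1 - cer)  # 0.8333, the rough "accuracy"
```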
The model is trained on the AISHELL-1 dataset: 150 hours of recordings covering more than 4,000 Chinese characters. **Industrial speech recognition systems are typically trained on at least 10x the audio used here, with language models trained on domain-specific corpora on top**, so do not expect this project to rival industrial systems. That is unrealistic for any individual project on GitHub, unless more advanced techniques come along.

*What is a language model trained on a domain-specific corpus? Take speech recognition inside a game: it is biased toward recognizing things you would plausibly say while playing, such as 「貂蝉被蓝打死了」 ("Diaochan was killed by Blue"). In any other context that sentence is nonsense. Say it to someone who has read Romance of the Three Kingdoms but never played Honor of Kings, and they will surely ask: "What? Diaochan was killed by whom? Who is 'Lan'?"*

On a single GTX 1080Ti, one training epoch takes about 20 minutes. (The lab's CUDA version is old; a newer CUDA version might well be faster.)

<img src="images/train.svg">

The figure above shows the validation-set CER over training epochs. As you can see, validation CER has dropped to 11%.

The test set is not shown in the figure; its CER is somewhat higher, around 14%.

Plugging in an external language model brings the test-set CER down to 8%.

The pretrained model this project provides was trained for roughly 100 epochs, at which point it was close to its best.
@@ -0,0 +1,48 @@
import _init_path
import torch
import feature
from models.conv import GatedConv
import torch.nn.functional as F
from ctcdecode import CTCBeamDecoder

# Beam-search hyperparameters: alpha weights the language model,
# beta rewards word insertions.
alpha = 0.8
beta = 0.3
lm_path = "lm/zh_giga.no_cna_cmn.prune01244.klm"
cutoff_top_n = 40
cutoff_prob = 1.0
beam_width = 32
num_processes = 4
blank_index = 0

model = GatedConv.load("pretrained/gated-conv.pth")
model.eval()

decoder = CTCBeamDecoder(
    model.vocabulary,
    lm_path,
    alpha,
    beta,
    cutoff_top_n,
    cutoff_prob,
    beam_width,
    num_processes,
    blank_index,
)


def translate(vocab, out, out_len):
    return "".join([vocab[x] for x in out[0:out_len]])


def predict(f):
    wav = feature.load_audio(f)
    spec = feature.spectrogram(wav)
    spec.unsqueeze_(0)  # add a batch dimension: (1, freq, time)
    with torch.no_grad():
        y = model.cnn(spec)
        y = F.softmax(y, 1)
    y_len = torch.tensor([y.size(-1)])
    y = y.permute(0, 2, 1)  # B * T * V, the layout CTCBeamDecoder expects
    print("decoding")
    out, score, offset, out_len = decoder.decode(y, y_len)
    return translate(model.vocabulary, out[0][0], out_len[0][0])
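A hypothetical invocation of this script, assuming the pretrained model and language model above have been downloaded; `examples/demo.wav` is a placeholder name for a 16 kHz mono WAV file, not a file shipped with the project:

```python
if __name__ == "__main__":
    # "examples/demo.wav" is a placeholder path.
    print(predict("examples/demo.wav"))
```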
@@ -0,0 +1,97 @@
import torch
import librosa
import wave
import numpy as np
import scipy
import json
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# STFT parameters: 20 ms Hamming windows with a 10 ms hop at 16 kHz.
sample_rate = 16000
window_size = 0.02
window_stride = 0.01
n_fft = int(sample_rate * window_size)
win_length = n_fft
hop_length = int(sample_rate * window_stride)
window = "hamming"


def load_audio(wav_path, normalize=True):  # -> numpy array
    with wave.open(wav_path) as wav:
        wav = np.frombuffer(wav.readframes(wav.getnframes()), dtype="int16")
        wav = wav.astype("float")
    if normalize:
        return (wav - wav.mean()) / wav.std()
    else:
        return wav


def spectrogram(wav, normalize=True):
    D = librosa.stft(
        wav, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window
    )

    spec, phase = librosa.magphase(D)
    spec = np.log1p(spec)  # log-compressed magnitude spectrogram
    spec = torch.FloatTensor(spec)

    if normalize:
        spec = (spec - spec.mean()) / spec.std()

    return spec


class MASRDataset(Dataset):
    def __init__(self, index_path, labels_path):
        # Each index line is "wav_path,transcript".
        with open(index_path) as f:
            idx = f.readlines()
        idx = [x.strip().split(",", 1) for x in idx]
        self.idx = idx
        with open(labels_path) as f:
            labels = json.load(f)
        self.labels = dict([(labels[i], i) for i in range(len(labels))])
        self.labels_str = labels

    def __getitem__(self, index):
        wav, transcript = self.idx[index]
        wav = load_audio(wav)
        spect = spectrogram(wav)
        # Map characters to label indices, dropping out-of-vocabulary characters.
        transcript = list(filter(None, [self.labels.get(x) for x in transcript]))

        return spect, transcript

    def __len__(self):
        return len(self.idx)


def _collate_fn(batch):
    def func(p):
        return p[0].size(1)

    # Sort by descending time length, then zero-pad every spectrogram
    # to the longest sample in the batch.
    batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True)
    longest_sample = max(batch, key=func)[0]
    freq_size = longest_sample.size(0)
    minibatch_size = len(batch)
    max_seqlength = longest_sample.size(1)
    inputs = torch.zeros(minibatch_size, freq_size, max_seqlength)
    input_lens = torch.IntTensor(minibatch_size)
    target_lens = torch.IntTensor(minibatch_size)
    targets = []
    for x in range(minibatch_size):
        sample = batch[x]
        tensor = sample[0]
        target = sample[1]
        seq_length = tensor.size(1)
        inputs[x].narrow(1, 0, seq_length).copy_(tensor)
        input_lens[x] = seq_length
        target_lens[x] = len(target)
        targets.extend(target)  # targets are concatenated flat, CTC-style
    targets = torch.IntTensor(targets)
    return inputs, targets, input_lens, target_lens


class MASRDataLoader(DataLoader):
    def __init__(self, *args, **kwargs):
        super(MASRDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = _collate_fn
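A sketch of how the dataset and loader might be wired together. The index and labels paths are hypothetical placeholders, and the batch size is arbitrary:

```python
# Hypothetical paths: each index line is "path.wav,transcript", and
# labels.json is a JSON list of vocabulary characters.
dataset = MASRDataset("data/train.index", "data/labels.json")
loader = MASRDataLoader(dataset, batch_size=32, shuffle=True)

inputs, targets, input_lens, target_lens = next(iter(loader))
print(inputs.shape)  # (32, freq_bins, max_time), zero-padded spectrograms
```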
@@ -0,0 +1,135 @@
import Levenshtein as Lev
import torch
from six.moves import xrange


class Decoder(object):
    """
    Basic decoder class from which all other decoders inherit. Implements several
    helper functions. Subclasses should implement the decode() method.
    Arguments:
        labels (string): mapping from integers to characters.
        blank_index (int, optional): index for the blank '_' character. Defaults to 0.
        space_index (int, optional): index for the space ' ' character. Defaults to 28.
    """

    def __init__(self, labels, blank_index=0):
        # e.g. labels = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ#"
        self.labels = labels
        self.int_to_char = dict([(i, c) for (i, c) in enumerate(labels)])
        self.blank_index = blank_index
        """
        space_index = len(labels)  # To prevent errors in decode, we add an out of bounds index for the space
        if ' ' in labels:
            space_index = labels.index(' ')
        self.space_index = space_index
        """

    def wer(self, s1, s2):
        """
        Computes the Word Error Rate, defined as the edit distance between the
        two provided sentences after tokenizing to words.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """

        # build mapping of words to integers
        b = set(s1.split() + s2.split())
        word2char = dict(zip(b, range(len(b))))

        # map the words to a char array (the Levenshtein package only
        # accepts strings)
        w1 = [chr(word2char[w]) for w in s1.split()]
        w2 = [chr(word2char[w]) for w in s2.split()]

        return Lev.distance("".join(w1), "".join(w2))

    def cer(self, s1, s2):
        """
        Computes the Character Error Rate, defined as the edit distance.
        Arguments:
            s1 (string): space-separated sentence
            s2 (string): space-separated sentence
        """
        s1, s2 = s1.replace(" ", ""), s2.replace(" ", "")
        return Lev.distance(s1, s2)

    def decode(self, probs, sizes=None):
        """
        Given a matrix of character probabilities, returns the decoder's
        best guess of the transcription
        Arguments:
            probs: Tensor of character probabilities, where probs[c,t]
                is the probability of character c at time t
            sizes(optional): Size of each sequence in the mini-batch
        Returns:
            string: sequence of the model's best guess for the transcription
        """
        raise NotImplementedError


class GreedyDecoder(Decoder):
    def __init__(self, labels, blank_index=0):
        super(GreedyDecoder, self).__init__(labels, blank_index)

    def convert_to_strings(
        self, sequences, sizes=None, remove_repetitions=False, return_offsets=False
    ):
        """Given a list of numeric sequences, returns the corresponding strings"""
        strings = []
        offsets = [] if return_offsets else None
        for x in xrange(len(sequences)):
            seq_len = sizes[x] if sizes is not None else len(sequences[x])
            string, string_offsets = self.process_string(
                sequences[x], seq_len, remove_repetitions
            )
            strings.append([string])  # We only return one path
            if return_offsets:
                offsets.append([string_offsets])
        if return_offsets:
            return strings, offsets
        else:
            return strings

    def process_string(self, sequence, size, remove_repetitions=False):
        string = ""
        offsets = []
        for i in range(size):
            char = self.int_to_char[sequence[i].item()]
            if char != self.int_to_char[self.blank_index]:
                # if this char is a repetition and remove_repetitions=true, then skip
                if (
                    remove_repetitions
                    and i != 0
                    and char == self.int_to_char[sequence[i - 1].item()]
                ):
                    pass
                else:
                    string = string + char
                    offsets.append(i)
        return string, torch.tensor(offsets, dtype=torch.int)

    def decode(self, probs, sizes=None):
        """
        Returns the argmax decoding given the probability matrix. Removes
        repeated elements in the sequence, as well as blanks.
        Arguments:
            probs: Tensor of character probabilities from the network. Expected shape of batch x seq_length x output_dim
            sizes(optional): Size of each sequence in the mini-batch
        Returns:
            strings: sequences of the model's best guess for the transcription on inputs
            offsets: time step per character predicted
        """
        _, max_probs = torch.max(probs, 2)
        strings, offsets = self.convert_to_strings(
            max_probs.view(max_probs.size(0), max_probs.size(1)),
            sizes,
            remove_repetitions=True,
            return_offsets=True,
        )
        return strings, offsets
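A toy run of `GreedyDecoder` on fabricated probabilities; the four-symbol label set is invented for illustration. The repeated "A" is collapsed and the blank is dropped, which is exactly the CTC post-processing `decode` performs:

```python
import torch

labels = "_ABC"  # hypothetical vocabulary: blank plus three characters
decoder = GreedyDecoder(labels, blank_index=0)

# Fake network output, shape batch x seq_length x output_dim = 1 x 5 x 4.
probs = torch.tensor([[
    [0.1, 0.8, 0.05, 0.05],   # A
    [0.1, 0.8, 0.05, 0.05],   # A (repeat, collapsed)
    [0.9, 0.05, 0.03, 0.02],  # blank (dropped)
    [0.1, 0.05, 0.8, 0.05],   # B
    [0.1, 0.05, 0.05, 0.8],   # C
]])
strings, offsets = decoder.decode(probs)
print(strings[0][0])  # "ABC"
```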
@@ -0,0 +1,71 @@
# MASR Compared with Other GitHub Projects

The quality of Chinese speech recognition projects on GitHub varies widely; this document is a reference for choosing among them.

It compares MASR with the following high-star Chinese speech recognition projects on GitHub:

1. ASRT: https://github.com/nl8590687/ASRT_SpeechRecognition
2. Project 2: https://github.com/xxbb1234021/speech_recognition
3. DeepSpeech: https://github.com/PaddlePaddle/DeepSpeech

Before the comparison proper, a few concepts are worth clarifying.

## Training Set vs. Test Set

In general, a speech recognition model's performance on the training set, the test set, and in real use satisfies:

* training-set performance > test-set performance > real-world performance

Whenever the comparison below mentions recognition performance, it means test-set performance.

This project's test set is the AISHELL-1 test set.

Real-world performance will always be worse than test-set performance.
## Versus ASRT

|                               | ASRT                 | MASR                                                        |
| ----------------------------- | -------------------- | ----------------------------------------------------------- |
| Pretrained model provided     | Yes                  | Yes                                                         |
| Pretrained model size         | **20MB**             | 120MB                                                       |
| Recognition performance       | Pinyin accuracy: 80% | **Character accuracy: 86%** (**92%** with a language model) |
| Framework                     | Keras                | PyTorch                                                     |
| End-to-end                    | Yes[?]               | **Yes**                                                     |
| Recognize your own recordings | Yes                  | Yes                                                         |

* ASRT's pretrained model is small and quick to download, which is a genuine advantage.
* Character accuracy is a stricter metric than pinyin accuracy.
* Although ASRT claims to be end-to-end, it actually converts speech to pinyin and then pinyin to characters.
* "Accuracy" here means (1 - character error rate).
## Versus Project 2

Project 2 used the thchs-30 test set as its training set, only **6 hours** of audio. The model is completely overfit, with no generalization ability whatsoever; put simply, **it is unusable**.

Project 2's home page showcases results on a "test set" the model **was trained on**, which is meaningless.

Project 2's issue page is mostly beginners stuck in the same pitfalls.

|                           | Project 2                             | MASR                                                        |
| ------------------------- | ------------------------------------- | ----------------------------------------------------------- |
| Pretrained model provided | Buried in the issues page, if you dig | **Yes**                                                     |
| Pretrained model size     | 110MB                                 | 120MB                                                       |
| Recognition performance   | Cannot actually recognize speech      | **Character accuracy: 86%** (**92%** with a language model) |
## Versus DeepSpeech

Project 3 is Baidu's official project. Its quality is indeed far above individual projects, and I have tested it myself. If you are reasonably experienced, I recommend it.

DeepSpeech provides pretrained models for both Chinese and English speech recognition. For Chinese it provides two: one trained on the AISHELL-1 dataset (like this project), and one trained on Baidu's internal 1,200-hour Chinese dataset (the power of money). Baidu reports the latter's performance on its internal test set: roughly **87% accuracy**.

|                           | DeepSpeech                  | MASR                     |
| ------------------------- | --------------------------- | ------------------------ |
| Framework                 | PaddlePaddle                | PyTorch                  |
| Pretrained model provided | Yes                         | Yes                      |
| Model architecture        | CNN + RNN (GRU)             | CNN only                 |
| Pretrained model size     | 750MB                       | **120MB**                |
| Training data size        | **1,200 hours**             | 150 hours                |
| External language model   | **Yes**                     | None                     |
| Recognition performance   | **87% (internal test set)** | 92% (AISHELL-1 test set) |

Note: Baidu's **87%** accuracy on its internal dataset is a stronger result than this project's **92%** on the AISHELL-1 test set, because Baidu's test set is far more diverse. My own guess is that Baidu's pretrained model could reach around 95% on AISHELL-1, but that number has not been published.