Skip to content

Commit

Permalink
change the text to phoneme conversion for Mandarin Chinese
Browse files Browse the repository at this point in the history
  • Loading branch information
Flux9665 committed Nov 6, 2022
1 parent bf0a59f commit 0850235
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 10 deletions.
60 changes: 50 additions & 10 deletions Preprocessing/TextFrontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import sys

import torch
from dragonmapper.transcriptions import pinyin_to_ipa
from phonemizer.backend import EspeakBackend
from pypinyin import pinyin

Expand Down Expand Up @@ -245,21 +246,57 @@ def get_phone_string(self, text, include_eos_symbol=True, for_feature_extraction
# expand abbreviations
utt = self.expand_abbreviations(text)
# phonemize
phones = self.phonemizer_backend.phonemize([utt], strip=True)[0] # To use a different phonemizer, this is the only line that needs to be exchanged
if self.g2p_lang == "cmn-latn-pinyin" or self.g2p_lang == "cmn":
phones = pinyin_to_ipa(utt)
else:
phones = self.phonemizer_backend.phonemize([utt], strip=True)[0] # To use a different phonemizer, this is the only line that needs to be exchanged

# Unfortunately, tonal languages don't agree on how tone is denoted: most
# tonal languages mark their tones using different numbering
# systems. At this point in the script, we attempt to unify
# them all to the tones in the IPA standard.
if self.g2p_lang == "cmn-latn-pinyin" or self.g2p_lang == "cmn":
"""
handling for the espeak use-case
phones = phones.replace(".", "") # no idea why espeak puts dots everywhere for Chinese
phones = phones.replace('1', "˥")
phones = phones.replace('2', "˧˥")
phones = phones.replace('ɜ', "˨˩") # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
phones = phones.replace('3', "˨˩") # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
phones = phones.replace('4', "˦˩")
phones = phones.replace('5', "˧")
phones = phones.replace('0', "˧")
# fix for a bug in espeak that ignores the second target in multi-target tones in mandarin, as proposed by GitHub user @GodEase
phones = ' '.join([re.sub(r'[1-5ɜ]', u[-1], p) for p, u in zip(phones.split(), utt.split())])
# unfortunately the pypinyin package gives us only the unique characters instead of the base characters with modifiers,
# so we have to do every vowel separately ̌ ́ ̄ ̀
# handle flat tone
phones = phones.replace("ā", "˥")
phones = phones.replace("ē", "˥")
phones = phones.replace("ī", "˥")
phones = phones.replace("ō", "˥")
phones = phones.replace("ū", "˥")
phones = phones.replace("ǖ", "˥")
# handle rising tone
phones = phones.replace("á", "˧˥")
phones = phones.replace("é", "˧˥")
phones = phones.replace("í", "˧˥")
phones = phones.replace("ó", "˧˥")
phones = phones.replace("ú", "˧˥")
phones = phones.replace("ǘ", "˧˥")
# handle dip tone
phones = phones.replace("ǎ", "˨˩˦")
phones = phones.replace("ĕ", "˨˩˦")
phones = phones.replace("ǐ", "˨˩˦")
phones = phones.replace("ǒ", "˨˩˦")
phones = phones.replace("ǔ", "˨˩˦")
phones = phones.replace("ǚ", "˨˩˦")
# handle falling tone
phones = phones.replace("à", "˥˩")
phones = phones.replace("è", "˥˩")
phones = phones.replace("ì", "˥˩")
phones = phones.replace("ò", "˥˩")
phones = phones.replace("ù", "˥˩")
phones = phones.replace("ǜ", "˥˩")
"""

if self.g2p_lang == "vi":
phones = phones.replace('1', "˧")
phones = phones.replace('2', "˨˩")
Expand All @@ -279,6 +316,7 @@ def postprocess_phoneme_string(self, phoneme_string, for_feature_extraction, inc
replacements = [
# punctuation in languages with non-latin script
("。", "."),
(",", ","),
("【", '"'),
("】", '"'),
("、", ","),
Expand All @@ -288,6 +326,8 @@ def postprocess_phoneme_string(self, phoneme_string, for_feature_extraction, inc
("“", '"'),
("”", '"'),
("؛", ","),
("《", '"'),
("》", '"'),
# latin script punctuation
("/", " "),
("—", ""),
Expand Down Expand Up @@ -456,8 +496,8 @@ def get_language_id(language):

tf = ArticulatoryCombinedTextFrontend(language="cmn")
tf.string_to_tensor("这是一个复杂的句子,它甚至包含一个停顿。", view=True)
tf.string_to_tensor("李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。", view=True)
tf.string_to_tensor("巴 拔 把 爸 吧", view=True)
tf.string_to_tensor("李绅 《悯农》 锄禾日当午, 汗滴禾下土。 谁知盘中餐, 粒粒皆辛苦。", view=True)
tf.string_to_tensor("巴 拔 把 爸 吧", view=True)

tf = ArticulatoryCombinedTextFrontend(language="vi")
tf.string_to_tensor("Xin chào thế giới, quả là một ngày tốt lành để học nói tiếng Việt!", view=True)
Expand Down
Binary file modified requirements.txt
Binary file not shown.

0 comments on commit 0850235

Please sign in to comment.