Change the text-to-phoneme conversion for Mandarin Chinese

DigitalPhonetics · Nov 6, 2022 · 0850235 · 0850235
1 parent bf0a59f
commit 0850235
Show file tree

Hide file tree

Showing 2 changed files with 50 additions and 10 deletions.
diff --git a/Preprocessing/TextFrontend.py b/Preprocessing/TextFrontend.py
@@ -5,6 +5,7 @@
 import sys
 
 import torch
+from dragonmapper.transcriptions import pinyin_to_ipa
 from phonemizer.backend import EspeakBackend
 from pypinyin import pinyin
 
@@ -245,21 +246,57 @@ def get_phone_string(self, text, include_eos_symbol=True, for_feature_extraction
         # expand abbreviations
         utt = self.expand_abbreviations(text)
         # phonemize
-        phones = self.phonemizer_backend.phonemize([utt], strip=True)[0]  # To use a different phonemizer, this is the only line that needs to be exchanged
+        if self.g2p_lang == "cmn-latn-pinyin" or self.g2p_lang == "cmn":
+            phones = pinyin_to_ipa(utt)
+        else:
+            phones = self.phonemizer_backend.phonemize([utt], strip=True)[0]  # To use a different phonemizer, this is the only line that needs to be exchanged
 
         # Unfortunately tonal languages don't agree on the tone, most tonal
         # languages use different tones denoted by different numbering
         # systems. At this point in the script, it is attempted to unify
         # them all to the tones in the IPA standard.
         if self.g2p_lang == "cmn-latn-pinyin" or self.g2p_lang == "cmn":
+            """
+            handling for the espeak use-case
+            
             phones = phones.replace(".", "")  # no idea why espeak puts dots everywhere for Chinese
-            phones = phones.replace('1', "˥")
-            phones = phones.replace('2', "˧˥")
-            phones = phones.replace('ɜ', "˨˩")  # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
-            phones = phones.replace('3', "˨˩")  # I'm fairly certain that this is a bug in espeak and ɜ is meant to be 3
-            phones = phones.replace('4', "˦˩")
-            phones = phones.replace('5', "˧")
-            phones = phones.replace('0', "˧")
+            
+            # fix for a bug in espeak that ignores the second target in multi-target tones in Mandarin, as proposed by GitHub user @GodEase
+            phones = ' '.join([re.sub(r'[1-5ɜ]', u[-1], p) for p, u in zip(phones.split(), utt.split())])
+
+            # unfortunately the pypinyin package gives us only the unique characters instead of the base characters with modifiers,
+            # so we have to do every vowel separately  ̌ ́ ̄ ̀
+
+            # handle flat tone
+            phones = phones.replace("ā", "˥")
+            phones = phones.replace("ē", "˥")
+            phones = phones.replace("ī", "˥")
+            phones = phones.replace("ō", "˥")
+            phones = phones.replace("ū", "˥")
+            phones = phones.replace("ǖ", "˥")
+            # handle rising tone
+            phones = phones.replace("á", "˧˥")
+            phones = phones.replace("é", "˧˥")
+            phones = phones.replace("í", "˧˥")
+            phones = phones.replace("ó", "˧˥")
+            phones = phones.replace("ú", "˧˥")
+            phones = phones.replace("ǘ", "˧˥")
+            # handle dip tone
+            phones = phones.replace("ǎ", "˨˩˦")
+            phones = phones.replace("ě", "˨˩˦")
+            phones = phones.replace("ǐ", "˨˩˦")
+            phones = phones.replace("ǒ", "˨˩˦")
+            phones = phones.replace("ǔ", "˨˩˦")
+            phones = phones.replace("ǚ", "˨˩˦")
+            # handle falling tone
+            phones = phones.replace("à", "˥˩")
+            phones = phones.replace("è", "˥˩")
+            phones = phones.replace("ì", "˥˩")
+            phones = phones.replace("ò", "˥˩")
+            phones = phones.replace("ù", "˥˩")
+            phones = phones.replace("ǜ", "˥˩")
+            """
+
         if self.g2p_lang == "vi":
             phones = phones.replace('1', "˧")
             phones = phones.replace('2', "˨˩")
@@ -279,6 +316,7 @@ def postprocess_phoneme_string(self, phoneme_string, for_feature_extraction, inc
         replacements = [
             # punctuation in languages with non-latin script
             ("。", "."),
+            ("，", ","),
             ("【", '"'),
             ("】", '"'),
             ("、", ","),
@@ -288,6 +326,8 @@ def postprocess_phoneme_string(self, phoneme_string, for_feature_extraction, inc
             ("“", '"'),
             ("”", '"'),
             ("؛", ","),
+            ("《", '"'),
+            ("》", '"'),
             # latin script punctuation
             ("/", " "),
             ("—", ""),
@@ -456,8 +496,8 @@ def get_language_id(language):
 
     tf = ArticulatoryCombinedTextFrontend(language="cmn")
     tf.string_to_tensor("这是一个复杂的句子，它甚至包含一个停顿。", view=True)
-    tf.string_to_tensor("李绅 《悯农》    锄禾日当午，    汗滴禾下土。    谁知盘中餐，    粒粒皆辛苦。", view=True)
-    tf.string_to_tensor("巴	拔	把	爸	吧", view=True)
+    tf.string_to_tensor("李绅 《悯农》 锄禾日当午， 汗滴禾下土。 谁知盘中餐， 粒粒皆辛苦。", view=True)
+    tf.string_to_tensor("巴 拔 把 爸 吧", view=True)
 
     tf = ArticulatoryCombinedTextFrontend(language="vi")
     tf.string_to_tensor("Xin chào thế giới, quả là một ngày tốt lành để học nói tiếng Việt!", view=True)

diff --git a/requirements.txt b/requirements.txt