Skip to content

Commit

Permalink
g2p: default to prons from the reference lexicon
Browse files Browse the repository at this point in the history
  • Loading branch information
proger committed Mar 4, 2023
1 parent d8b3030 commit 57e1638
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 5 deletions.
8 changes: 7 additions & 1 deletion uk/g2p.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

g2p_base = G2P('ukro-base-uncased')

# X3 means prolonged version of X
replacements = {
'SH23': 'SH2',
'H3': 'H',
Expand All @@ -39,9 +40,14 @@
'ZH3': 'ZH',
}

# Load the hand-checked Common Voice lexicon once at import time; entries found
# here take precedence over model-generated pronunciations (see g2p below).
# NOTE(review): read_lexicon presumably returns (words, word->pron mapping) —
# only the mapping is kept; confirm against its definition.
_, reference_lexicon = read_lexicon(Path(__file__).parent / '../data/local/dict/lexicon_common_voice_uk.txt')


def g2p(word):
    """Return the pronunciation (sequence of phones) for *word*.

    A pronunciation from the reference lexicon, when present, takes
    precedence over the model output.  Otherwise the ukro-g2p model is
    queried and its phones are normalized through ``replacements``
    (e.g. collapsing prolonged phones such as ``'H3'`` -> ``'H'``).
    """
    # Stale pre-change `return` line removed: it made the lexicon lookup
    # below unreachable dead code.
    pron = reference_lexicon.get(word)
    if not pron:
        # Fall back to the model, mapping phones we do not keep verbatim.
        pron = [replacements.get(p, p) for p in g2p_base(word)]
    return pron


def g2p_batch(words: Sequence[str]) -> Mapping[str, str]:
Expand Down
11 changes: 7 additions & 4 deletions uk/prepare_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,11 @@ def prepare(dataset, datadir, g2p=None, alphabet='cyr', copy_wav=False):
lexicon_txt = stack.enter_context(open(datadir / 'lexicon.txt', 'w'))

for sample in tqdm(dataset):
utterance_id = str(sample.get('id') or Path(sample['path']).stem)
#utterance_id = str(sample.get('id') or Path(sample['path']).stem)
utterance_id = Path(sample['path']).stem
speaker_id = str(sample.get('speaker_id', utterance_id))

orig_sentence = sample.get('sentence') or sample['text']
orig_sentence = sample.get('sentence') or sample.get('transcription') or sample['text']
sentence = keep_useful_characters(orig_sentence, alphabet=alphabet, utterance_id=utterance_id)
if sentence is None:
continue
Expand All @@ -85,7 +86,7 @@ def prepare(dataset, datadir, g2p=None, alphabet='cyr', copy_wav=False):
text=text[utterance_id],
orig_text=orig_sentence,
spk=utt2spk[utterance_id],
media=loc), pk='utterance_id')
media=str(loc)), pk='utterance_id')

for word in words:
if not word in lexicon:
Expand Down Expand Up @@ -116,6 +117,8 @@ def prepare(dataset, datadir, g2p=None, alphabet='cyr', copy_wav=False):
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--lexicon', action='store_true',
help='generate lexicon for every word using ukro-g2p')
parser.add_argument('--copy-wav', action='store_true',
help='copy wav files from the dataset (useful if the paths are relative and make_mfcc fails to find them later on)')
parser.add_argument('--dataset', default='mozilla-foundation/common_voice_10_0',
help='dataset name on Hugging Face')
parser.add_argument('--subset', default='uk',
Expand All @@ -141,4 +144,4 @@ def prepare(dataset, datadir, g2p=None, alphabet='cyr', copy_wav=False):
else:
g2p = None

prepare(uk, datadir, g2p=g2p, alphabet=args.alphabet)
prepare(uk, datadir, g2p=g2p, alphabet=args.alphabet, copy_wav=args.copy_wav)

0 comments on commit 57e1638

Please sign in to comment.