Skip to content

Commit

Permalink
g2p: default to prons from the reference lexicon
Browse files Browse the repository at this point in the history
  • Loading branch information
proger committed Mar 4, 2023
1 parent d8b3030 commit 57e1638
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 5 deletions.
8 changes: 7 additions & 1 deletion uk/g2p.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

g2p_base = G2P('ukro-base-uncased')

# X3 means prolonged version of X
replacements = {
'SH23': 'SH2',
'H3': 'H',
Expand All @@ -39,9 +40,14 @@
'ZH3': 'ZH',
}

# Load the hand-checked Common Voice lexicon once at import time; entries found
# here take precedence over model-generated pronunciations (see g2p below).
# NOTE(review): read_lexicon presumably returns (words, word->pron mapping) —
# only the mapping is kept; confirm against its definition.
_, reference_lexicon = read_lexicon(Path(__file__).parent / '../data/local/dict/lexicon_common_voice_uk.txt')


def g2p(word):
    """Return the pronunciation (sequence of phones) for *word*.

    A pronunciation from the reference lexicon, when present, takes
    precedence over the model output.  Otherwise the ukro-g2p model is
    queried and its phones are normalized through ``replacements``
    (e.g. collapsing prolonged phones such as ``'H3'`` -> ``'H'``).
    """
    # Stale pre-change `return` line removed: it made the lexicon lookup
    # below unreachable dead code.
    pron = reference_lexicon.get(word)
    if not pron:
        # Fall back to the model, mapping phones we do not keep verbatim.
        pron = [replacements.get(p, p) for p in g2p_base(word)]
    return pron


def g2p_batch(words: Sequence[str]) -> Mapping[str, str]:
Expand Down
11 changes: 7 additions & 4 deletions uk/prepare_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,11 @@ def prepare(dataset, datadir, g2p=None, alphabet='cyr', copy_wav=False):
lexicon_txt = stack.enter_context(open(datadir / 'lexicon.txt', 'w'))

for sample in tqdm(dataset):
utterance_id = str(sample.get('id') or Path(sample['path']).stem)
#utterance_id = str(sample.get('id') or Path(sample['path']).stem)
utterance_id = Path(sample['path']).stem
speaker_id = str(sample.get('speaker_id', utterance_id))

orig_sentence = sample.get('sentence') or sample['text']
orig_sentence = sample.get('sentence') or sample.get('transcription') or sample['text']
sentence = keep_useful_characters(orig_sentence, alphabet=alphabet, utterance_id=utterance_id)
if sentence is None:
continue
Expand All @@ -85,7 +86,7 @@ def prepare(dataset, datadir, g2p=None, alphabet='cyr', copy_wav=False):
text=text[utterance_id],
orig_text=orig_sentence,
spk=utt2spk[utterance_id],
media=loc), pk='utterance_id')
media=str(loc)), pk='utterance_id')

for word in words:
if not word in lexicon:
Expand Down Expand Up @@ -116,6 +117,8 @@ def prepare(dataset, datadir, g2p=None, alphabet='cyr', copy_wav=False):
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--lexicon', action='store_true',
help='generate lexicon for every word using ukro-g2p')
parser.add_argument('--copy-wav', action='store_true',
help='copy wav files from the dataset (useful if the paths are relative and make_mfcc fails to find them later on)')
parser.add_argument('--dataset', default='mozilla-foundation/common_voice_10_0',
help='dataset name on Hugging Face')
parser.add_argument('--subset', default='uk',
Expand All @@ -141,4 +144,4 @@ def prepare(dataset, datadir, g2p=None, alphabet='cyr', copy_wav=False):
else:
g2p = None

prepare(uk, datadir, g2p=g2p, alphabet=args.alphabet)
prepare(uk, datadir, g2p=g2p, alphabet=args.alphabet, copy_wav=args.copy_wav)

0 comments on commit 57e1638

Please sign in to comment.