diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py
index e155c2649..9ff7fb563 100644
--- a/test/test_synthesis_engine.py
+++ b/test/test_synthesis_engine.py
@@ -13,11 +13,17 @@
 # TODO: import from voicevox_engine.synthesis_engine.mora
 from voicevox_engine.synthesis_engine.synthesis_engine import (
+    apply_intonation_scale,
+    apply_output_sampling_rate,
+    apply_output_stereo,
+    apply_pitch_scale,
+    apply_prepost_silence,
+    apply_speed_scale,
+    apply_volume_scale,
     calc_frame_per_phoneme,
     calc_frame_phoneme,
     calc_frame_pitch,
     mora_phoneme_list,
-    pad_with_silence,
     pre_process,
     split_mora,
     to_flatten_moras,
@@ -173,8 +179,8 @@ def _gen_mora(
     )
 
 
-def test_pad_with_silence():
-    """Test `pad_with_silence`."""
+def test_apply_prepost_silence():
+    """Test `apply_prepost_silence`."""
     # Inputs
     query = _gen_query(prePhonemeLength=2 * 0.01067, postPhonemeLength=6 * 0.01067)
     moras = [
@@ -189,11 +195,139 @@ def test_pad_with_silence():
     ]
 
     # Outputs
-    moras_with_silence = pad_with_silence(moras, query)
+    moras_with_silence = apply_prepost_silence(moras, query)
 
     assert moras_with_silence == true_moras_with_silence
 
 
+def test_apply_speed_scale():
+    """Test `apply_speed_scale`."""
+    # Inputs
+    query = _gen_query(speedScale=2.0)
+    input_moras = [
+        _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
+        _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
+        _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
+        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
+        _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
+    ]
+
+    # Expects - x2 fast
+    true_moras = [
+        _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0),
+        _gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0),
+        _gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0),
+        _gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0),
+        _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0),
+    ]
+
+    # Outputs
+    moras = apply_speed_scale(input_moras, query)
+
+    assert moras == true_moras
+
+
+def test_apply_pitch_scale():
+    """Test `apply_pitch_scale`."""
+    # Inputs
+    query = _gen_query(pitchScale=2.0)
+    input_moras = [
+        _gen_mora("コ", "k", 0.0, "o", 0.0, 50.0),
+        _gen_mora("ン", None, None, "N", 0.0, 50.0),
+        _gen_mora("、", None, None, "pau", 0.0, 0.0),
+        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0),
+        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
+    ]
+
+    # Expects - x4 value scaled
+    true_moras = [
+        _gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
+        _gen_mora("ン", None, None, "N", 0.0, 200.0),
+        _gen_mora("、", None, None, "pau", 0.0, 0.0),
+        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
+        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
+    ]
+
+    # Outputs
+    moras = apply_pitch_scale(input_moras, query)
+
+    assert moras == true_moras
+
+
+def test_apply_intonation_scale():
+    """Test `apply_intonation_scale`."""
+    # Inputs
+    query = _gen_query(intonationScale=0.5)
+    input_moras = [
+        _gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
+        _gen_mora("ン", None, None, "N", 0.0, 200.0),
+        _gen_mora("、", None, None, "pau", 0.0, 0.0),
+        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
+        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
+    ]
+
+    # Expects - mean=300 var x0.5 intonation scaling
+    true_moras = [
+        _gen_mora("コ", "k", 0.0, "o", 0.0, 250.0),
+        _gen_mora("ン", None, None, "N", 0.0, 250.0),
+        _gen_mora("、", None, None, "pau", 0.0, 0.0),
+        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 400.0),
+        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
+    ]
+
+    # Outputs
+    moras = apply_intonation_scale(input_moras, query)
+
+    assert moras == true_moras
+
+
+def test_apply_volume_scale():
+    """Test `apply_volume_scale`."""
+    # Inputs
+    query = _gen_query(volumeScale=3.0)
+    input_wave = numpy.array([0.0, 1.0, 2.0])
+
+    # Expects - x3 scale
+    true_wave = numpy.array([0.0, 3.0, 6.0])
+
+    # Outputs
+    wave = apply_volume_scale(input_wave, query)
+
+    assert numpy.allclose(wave, true_wave)
+
+
+def test_apply_output_sampling_rate():
+    """Test `apply_output_sampling_rate`."""
+    # Inputs
+    query = _gen_query(outputSamplingRate=12000)
+    input_wave = numpy.array([1.0 for _ in range(120)])
+    input_sr_wave = 24000
+
+    # Expects - half sampling rate
+    true_wave = numpy.array([1.0 for _ in range(60)])
+    assert true_wave.shape == (60,), "Prerequisites"
+
+    # Outputs
+    wave = apply_output_sampling_rate(input_wave, input_sr_wave, query)
+
+    assert wave.shape[0] == true_wave.shape[0]
+
+
+def test_apply_output_stereo():
+    """Test `apply_output_stereo`."""
+    # Inputs
+    query = _gen_query(outputStereo=True)
+    input_wave = numpy.array([1.0, 0.0, 2.0])
+
+    # Expects - Stereo :: (Time, Channel)
+    true_wave = numpy.array([[1.0, 1.0], [0.0, 0.0], [2.0, 2.0]])
+
+    # Outputs
+    wave = apply_output_stereo(input_wave, query)
+
+    assert numpy.array_equal(wave, true_wave)
+
+
 def test_calc_frame_per_phoneme():
     """Test `calc_frame_per_phoneme`."""
     # Inputs
@@ -325,7 +459,7 @@ def test_feat_to_framescale():
     assert true_frame_per_phoneme.shape[0] == len(phoneme_data_list), "Prerequisites"
 
     # Outputs
-    flatten_moras = pad_with_silence(flatten_moras, query)
+    flatten_moras = apply_prepost_silence(flatten_moras, query)
     frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
     f0 = calc_frame_pitch(query, flatten_moras, phoneme_data_list, frame_per_phoneme)
     frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)
diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py
index 9bd7dde56..9fa12d3a5 100644
--- a/voicevox_engine/synthesis_engine/synthesis_engine.py
+++ b/voicevox_engine/synthesis_engine/synthesis_engine.py
@@ -1,8 +1,10 @@
+import math
 import threading
 from itertools import chain
 from typing import List, Optional, Tuple
 
 import numpy
+from numpy import ndarray
 from soxr import resample
 
 from ..acoustic_feature_extractor import OjtPhoneme
@@ -112,8 +114,9 @@ def generate_silence_mora(length: float) -> Mora:
     return Mora(text=" ", vowel="sil", vowel_length=length, pitch=0.0)
 
 
-def pad_with_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]:
-    """モーラ列の先頭/最後尾へqueryに基づいた無音モーラを追加
+def apply_prepost_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]:
+    """
+    前後無音(`prePhonemeLength` & `postPhonemeLength`)の適用
     Parameters
     ----------
     moras : List[Mora]
@@ -131,6 +134,27 @@ def pad_with_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]:
     return moras
 
 
+def apply_speed_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]:
+    """
+    話速スケール(`speedScale`)の適用
+    Parameters
+    ----------
+    moras : list[Mora]
+        モーラ系列
+    query : AudioQuery
+        音声合成クエリ
+    Returns
+    -------
+    moras : list[Mora]
+        話速スケールが適用されたモーラ系列
+    """
+    for mora in moras:
+        mora.vowel_length /= query.speedScale
+        if mora.consonant_length:
+            mora.consonant_length /= query.speedScale
+    return moras
+
+
 def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
     """
     音素あたりのフレーム長を算出
@@ -145,6 +169,9 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
     frame_per_phoneme : NDArray[]
         音素あたりのフレーム長。端数丸め。
     """
+    # Apply: グローバル特徴量による補正(話速)
+    moras = apply_speed_scale(moras, query)
+
     # 音素あたりの継続長
     sec_per_phoneme = numpy.array(
         [
@@ -157,10 +184,6 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
         ],
         dtype=numpy.float32,
     )
-
-    # 話速による継続長の補正
-    sec_per_phoneme /= query.speedScale
-
     # 音素あたりのフレーム長。端数丸め。
     framerate = 24000 / 256  # framerate 93.75 [frame/sec]
     frame_per_phoneme = numpy.round(sec_per_phoneme * framerate).astype(numpy.int32)
@@ -168,6 +191,48 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
     return frame_per_phoneme
 
 
+def apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]:
+    """
+    音高スケール(`pitchScale`)の適用
+    Parameters
+    ----------
+    moras : list[Mora]
+        モーラ系列
+    query : AudioQuery
+        音声合成クエリ
+    Returns
+    -------
+    moras : list[Mora]
+        音高スケールが適用されたモーラ系列
+    """
+    for mora in moras:
+        mora.pitch *= 2**query.pitchScale
+    return moras
+
+
+def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]:
+    """
+    抑揚スケール(`intonationScale`)の適用
+    Parameters
+    ----------
+    moras : list[Mora]
+        モーラ系列
+    query : AudioQuery
+        音声合成クエリ
+    Returns
+    -------
+    moras : list[Mora]
+        抑揚スケールが適用されたモーラ系列
+    """
+    # 有声音素 (f0>0) の平均値に対する乖離度をスケール
+    voiced = list(filter(lambda mora: mora.pitch > 0, moras))
+    mean_f0 = numpy.mean(list(map(lambda mora: mora.pitch, voiced))).item()
+    if not math.isnan(mean_f0):  # 空リスト -> NaN
+        for mora in voiced:
+            mora.pitch = (mora.pitch - mean_f0) * query.intonationScale + mean_f0
+    return moras
+
+
 def calc_frame_pitch(
     query: AudioQuery,
     moras: List[Mora],
@@ -191,30 +256,41 @@ def calc_frame_pitch(
     frame_f0 : NDArray[]
         フレームごとの基本周波数系列
     """
+    moras = apply_pitch_scale(moras, query)
+    moras = apply_intonation_scale(moras, query)
+
+    # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790)
     # モーラごとの基本周波数
     f0 = numpy.array([mora.pitch for mora in moras], dtype=numpy.float32)
 
-    # 音高スケールによる補正
-    f0 *= 2**query.pitchScale
-
-    # 抑揚スケールによる補正。有声音素 (f0>0) の平均値に対する乖離度をスケール
-    voiced = f0 > 0
-    mean_f0 = f0[voiced].mean()
-    if not numpy.isnan(mean_f0):
-        f0[voiced] = (f0[voiced] - mean_f0) * query.intonationScale + mean_f0
-
-    # フレームごとのピッチ化
+    # Rescale: 時間スケールの変更(モーラ -> フレーム)
     # 母音インデックスに基づき "音素あたりのフレーム長" を "モーラあたりのフレーム長" に集約
     vowel_indexes = numpy.array(split_mora(phonemes)[2])
     frame_per_mora = [
         a.sum() for a in numpy.split(frame_per_phoneme, vowel_indexes[:-1] + 1)
     ]
-    # モーラの基本周波数を子音・母音に割当てフレーム化
     frame_f0 = numpy.repeat(f0, frame_per_mora)
     return frame_f0
 
 
+def apply_volume_scale(wave: numpy.ndarray, query: AudioQuery) -> numpy.ndarray:
+    """
+    音量スケール(`volumeScale`)の適用
+    Parameters
+    ----------
+    wave : numpy.ndarray
+        音声波形
+    query : AudioQuery
+        音声合成クエリ
+    Returns
+    -------
+    wave : numpy.ndarray
+        音量スケールが適用された音声波形
+    """
+    wave *= query.volumeScale
+    return wave
+
+
 def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndarray):
     """
     フレームごとの音素列の生成(onehot化 + フレーム化)
@@ -230,11 +306,59 @@ def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndar
         フレームごとの音素系列
     """
     # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790)
+    # Convert: Core入力形式への変換(onehotベクトル系列)
     onehot_phoneme = numpy.stack([p.onehot for p in phonemes])
+
+    # Rescale: 時間スケールの変更(音素 -> フレーム)
     frame_phoneme = numpy.repeat(onehot_phoneme, frame_per_phoneme, axis=0)
     return frame_phoneme
 
 
+def apply_output_sampling_rate(
+    wave: ndarray, sr_wave: int, query: AudioQuery
+) -> ndarray:
+    """
+    出力サンプリングレート(`outputSamplingRate`)の適用
+    Parameters
+    ----------
+    wave : ndarray
+        音声波形
+    sr_wave : int
+        `wave`のサンプリングレート
+    query : AudioQuery
+        音声合成クエリ
+    Returns
+    -------
+    wave : ndarray
+        出力サンプリングレートが適用された音声波形
+    """
+    # サンプリングレート一致のときはスルー
+    if sr_wave == query.outputSamplingRate:
+        return wave
+
+    wave = resample(wave, sr_wave, query.outputSamplingRate)
+    return wave
+
+
+def apply_output_stereo(wave: ndarray, query: AudioQuery) -> ndarray:
+    """
+    ステレオ出力(`outputStereo`)の適用
+    Parameters
+    ----------
+    wave : ndarray
+        音声波形
+    query : AudioQuery
+        音声合成クエリ
+    Returns
+    -------
+    wave : ndarray
+        ステレオ出力設定が適用された音声波形
+    """
+    if query.outputStereo:
+        wave = numpy.array([wave, wave]).T
+    return wave
+
+
 class SynthesisEngine(SynthesisEngineBase):
     """音声合成器(core)の管理/実行/プロキシと音声合成フロー"""
 
@@ -493,7 +617,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
         # AccentPhraseをすべてMoraおよびOjtPhonemeの形に分解し、処理可能な形にする
         flatten_moras, phoneme_data_list = pre_process(query.accent_phrases)
 
-        flatten_moras = pad_with_silence(flatten_moras, query)
+        flatten_moras = apply_prepost_silence(flatten_moras, query)
         frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
         f0 = calc_frame_pitch(
             query, flatten_moras, phoneme_data_list, frame_per_phoneme
@@ -509,21 +633,10 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
             phoneme=phoneme,
             style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1),
         )
+        sr_wave = self.default_sampling_rate
 
-        # volume: ゲイン適用
-        wave *= query.volumeScale
-
-        # 出力サンプリングレートがデフォルト(decode forwarderによるもの、24kHz)でなければ、それを適用する
-        if query.outputSamplingRate != self.default_sampling_rate:
-            wave = resample(
-                wave,
-                self.default_sampling_rate,
-                query.outputSamplingRate,
-            )
-
-        # ステレオ変換
-        # 出力設定がステレオなのであれば、ステレオ化する
-        if query.outputStereo:
-            wave = numpy.array([wave, wave]).T
+        wave = apply_volume_scale(wave, query)
+        wave = apply_output_sampling_rate(wave, sr_wave, query)
+        wave = apply_output_stereo(wave, query)
 
         return wave
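
Notes on the refactor, with standalone sketches (illustrative only, not part of the diff):

The test durations are multiples of 0.01067 because `calc_frame_per_phoneme` quantizes seconds to frames at a fixed 24000 / 256 = 93.75 frames/sec, and 1 / 93.75 ≈ 0.01067 sec, so each unit rounds to exactly one frame. A quick check with plain numpy, mirroring the rounding used in the diff:

    import numpy

    framerate = 24000 / 256  # 93.75 frames/sec; 1 frame ~= 0.01067 sec
    sec_per_phoneme = numpy.array([2 * 0.01067, 4 * 0.01067], dtype=numpy.float32)
    # Same rounding as calc_frame_per_phoneme: seconds -> whole frames
    print(numpy.round(sec_per_phoneme * framerate).astype(numpy.int32))  # [2 4]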
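
`apply_intonation_scale` rescales each voiced mora's pitch around the mean f0 of the voiced morae only (f0 > 0), leaving unvoiced morae at 0. A self-contained sketch of that arithmetic using the values from `test_apply_intonation_scale`; `scale_intonation` is a hypothetical name, not part of the diff:

    import numpy

    def scale_intonation(f0: numpy.ndarray, scale: float) -> numpy.ndarray:
        # Hypothetical sketch: scale voiced (f0 > 0) deviation from the voiced mean.
        voiced = f0 > 0
        if not voiced.any():  # no voiced morae -> nothing to rescale
            return f0
        mean_f0 = f0[voiced].mean()
        f0 = f0.copy()
        f0[voiced] = (f0[voiced] - mean_f0) * scale + mean_f0
        return f0

    # Voiced mean = (200 + 200 + 500) / 3 = 300.0
    print(scale_intonation(numpy.array([200.0, 200.0, 0.0, 500.0, 0.0]), 0.5))
    # -> [250. 250.   0. 400.   0.]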
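
After decoding, `_synthesis_impl` now applies the waveform effects as three composable steps: volume scale, output sampling rate, then stereo expansion. A minimal end-to-end sketch of that chain, assuming `soxr` is installed (the diff already imports it); `postprocess` is a hypothetical wrapper, not part of the diff:

    import numpy
    from soxr import resample  # same resampler the engine uses

    def postprocess(
        wave: numpy.ndarray, sr_wave: int, volume: float, sr_out: int, stereo: bool
    ) -> numpy.ndarray:
        # Hypothetical wrapper: volume -> sampling rate -> stereo, in the
        # order _synthesis_impl applies them.
        wave = wave * volume
        if sr_wave != sr_out:
            wave = resample(wave, sr_wave, sr_out)
        if stereo:
            wave = numpy.array([wave, wave]).T  # (Time,) -> (Time, Channel)
        return wave

    # 120 samples at 24 kHz, tripled gain, downsampled to 12 kHz, stereo
    print(postprocess(numpy.ones(120), 24000, 3.0, 12000, True).shape)  # (60, 2)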