Skip to content

Commit

Permalink
Refactor: frame_per_mora による置き換え (#841)
Browse files Browse the repository at this point in the history
  • Loading branch information
tarepan authored Dec 9, 2023
1 parent d0a596d commit 5d7562c
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 31 deletions.
48 changes: 34 additions & 14 deletions test/test_synthesis_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
apply_prepost_silence,
apply_speed_scale,
apply_volume_scale,
calc_frame_per_mora,
calc_frame_per_phoneme,
calc_frame_phoneme,
calc_frame_pitch,
Expand Down Expand Up @@ -353,24 +354,43 @@ def test_calc_frame_per_phoneme():
assert numpy.array_equal(frame_per_phoneme, true_frame_per_phoneme)


def test_calc_frame_per_mora():
    """Check that `calc_frame_per_mora` yields the expected frame count per mora."""
    # Inputs: every phoneme length is a whole multiple of 0.01067 [sec/frame]
    moras = [
        _gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0),
        _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0),
        _gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0),
        _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0),
        _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
        _gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0),
    ]

    # Expects: consonant frames + vowel frames for each mora
    #                    Pre ko  N pau hi hO Pst
    expected = numpy.array([2, 6, 4, 2, 6, 6, 6], dtype=numpy.int32)

    # Outputs: one frame count per mora, in input order
    actual = numpy.array([calc_frame_per_mora(mora) for mora in moras])

    assert numpy.array_equal(actual, expected)


def test_calc_frame_pitch():
"""Test `test_calc_frame_pitch`."""
# Inputs
query = _gen_query(pitchScale=2.0, intonationScale=0.5)
moras = [
_gen_mora(" ", None, None, " ", 0.0, 0.0),
_gen_mora("コ", "k", 0.0, "o", 0.0, 50.0),
_gen_mora("ン", None, None, "N", 0.0, 50.0),
_gen_mora("、", None, None, "pau", 0.0, 0.0),
_gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0),
_gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
_gen_mora(" ", None, None, " ", 0.0, 0.0),
_gen_mora(" ", None, None, " ", 1 * 0.01067, 0.0),
_gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0),
_gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0),
_gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0),
_gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0),
_gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0),
_gen_mora(" ", None, None, " ", 3 * 0.01067, 0.0),
]
phoneme_str = "pau k o N pau h i h O pau"
phonemes = [OjtPhoneme(p) for p in phoneme_str.split()]
# Pre k o N pau h i h O Pst
frame_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3]
frame_per_phoneme = numpy.array(frame_per_phoneme, dtype=numpy.int32)

# Expects - x4 value scaled -> mean=300 var x0.5 intonation scaling
# pau ko ko ko N N
Expand All @@ -382,7 +402,7 @@ def test_calc_frame_pitch():
true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32)

# Outputs
f0 = calc_frame_pitch(query, moras, phonemes, frame_per_phoneme)
f0 = calc_frame_pitch(query, moras)

assert numpy.array_equal(f0, true_f0)

Expand Down Expand Up @@ -461,7 +481,7 @@ def test_feat_to_framescale():
# Outputs
flatten_moras = apply_prepost_silence(flatten_moras, query)
frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
f0 = calc_frame_pitch(query, flatten_moras, phoneme_data_list, frame_per_phoneme)
f0 = calc_frame_pitch(query, flatten_moras)
frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)

assert numpy.array_equal(frame_phoneme, true_frame_phoneme)
Expand Down
43 changes: 26 additions & 17 deletions voicevox_engine/synthesis_engine/synthesis_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,29 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
return frame_per_phoneme


def _to_frame(sec: float) -> ndarray:
    """
    Convert a duration in seconds into a rounded number of frames.

    Parameters
    ----------
    sec : float
        Duration in seconds.

    Returns
    -------
    frames : ndarray
        `sec` multiplied by the frame rate, rounded with `numpy.round`
        and cast to int32.
    """
    frames_per_sec = 24000 / 256  # = 93.75 [frame/sec]
    scaled = sec * frames_per_sec
    return numpy.round(scaled).astype(numpy.int32)


def calc_frame_per_mora(mora: Mora) -> ndarray:
    """
    Compute the frame length of a single mora.

    Parameters
    ----------
    mora : Mora
        The mora to measure.

    Returns
    -------
    frame_per_mora : NDArray[]
        Frame length of the mora. Each phoneme length is rounded to whole
        frames before the two are summed.
    """
    # Convert each phoneme to frames separately; the mora length is their sum.
    vowel_frames = _to_frame(mora.vowel_length)
    if mora.consonant:
        consonant_frames = _to_frame(mora.consonant_length)
    else:
        # A mora with a falsy `consonant` contributes no consonant frames.
        consonant_frames = 0
    return vowel_frames + consonant_frames


def apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]:
"""
音高スケール(`pitchScale`)の適用
Expand Down Expand Up @@ -233,12 +256,7 @@ def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]:
return moras


def calc_frame_pitch(
query: AudioQuery,
moras: List[Mora],
phonemes: List[OjtPhoneme],
frame_per_phoneme: numpy.ndarray,
):
def calc_frame_pitch(query: AudioQuery, moras: list[Mora]) -> ndarray:
"""
フレームごとのピッチの生成
Parameters
Expand All @@ -247,10 +265,6 @@ def calc_frame_pitch(
音声合成クエリ
moras : List[Mora]
モーラ列
phonemes : List[OjtPhoneme]
音素列
frame_per_phoneme: NDArray
音素あたりのフレーム長。端数丸め。
Returns
-------
frame_f0 : NDArray[]
Expand All @@ -265,10 +279,7 @@ def calc_frame_pitch(

# Rescale: 時間スケールの変更(モーラ -> フレーム)
# Compute the frame length of each mora directly from its own consonant/vowel lengths
vowel_indexes = numpy.array(split_mora(phonemes)[2])
frame_per_mora = [
a.sum() for a in numpy.split(frame_per_phoneme, vowel_indexes[:-1] + 1)
]
frame_per_mora = numpy.array(list(map(calc_frame_per_mora, moras)))
frame_f0 = numpy.repeat(f0, frame_per_mora)
return frame_f0

Expand Down Expand Up @@ -619,9 +630,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):

flatten_moras = apply_prepost_silence(flatten_moras, query)
frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
f0 = calc_frame_pitch(
query, flatten_moras, phoneme_data_list, frame_per_phoneme
)
f0 = calc_frame_pitch(query, flatten_moras)
phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)

# 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する
Expand Down

0 comments on commit 5d7562c

Please sign in to comment.