diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py
index e155c2649..9ff7fb563 100644
--- a/test/test_synthesis_engine.py
+++ b/test/test_synthesis_engine.py
@@ -13,11 +13,17 @@
 # TODO: import from voicevox_engine.synthesis_engine.mora
 from voicevox_engine.synthesis_engine.synthesis_engine import (
+    apply_intonation_scale,
+    apply_output_sampling_rate,
+    apply_output_stereo,
+    apply_pitch_scale,
+    apply_prepost_silence,
+    apply_speed_scale,
+    apply_volume_scale,
     calc_frame_per_phoneme,
     calc_frame_phoneme,
     calc_frame_pitch,
     mora_phoneme_list,
-    pad_with_silence,
     pre_process,
     split_mora,
     to_flatten_moras,
@@ -173,8 +179,8 @@ def _gen_mora(
     )
 
 
-def test_pad_with_silence():
-    """Test `pad_with_silence`."""
+def test_apply_prepost_silence():
+    """Test `apply_prepost_silence`."""
     # Inputs
     query = _gen_query(prePhonemeLength=2 * 0.01067, postPhonemeLength=6 * 0.01067)
     moras = [
@@ -189,11 +195,139 @@ def test_pad_with_silence():
     ]
 
     # Outputs
-    moras_with_silence = pad_with_silence(moras, query)
+    moras_with_silence = apply_prepost_silence(moras, query)
 
     assert moras_with_silence == true_moras_with_silence
 
 
+def test_apply_speed_scale():
+    """Test `apply_speed_scale`."""
+    # Inputs
+    query = _gen_query(speedScale=2.0)
+    input_moras = [
+        _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 50.0),
+        _gen_mora("ン", None, None, "N", 4 * 0.01067, 50.0),
+        _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
+        _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 125.0),
+        _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
+    ]
+
+    # Expects - x2 fast
+    true_moras = [
+        _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0),
+        _gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0),
+        _gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0),
+        _gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0),
+        _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0),
+    ]
+
+    # Outputs
+    moras = apply_speed_scale(input_moras, query)
+
+    assert moras == true_moras
+
+
+def test_apply_pitch_scale():
+    """Test `apply_pitch_scale`."""
+    # Inputs
+    query = _gen_query(pitchScale=2.0)
+    input_moras = [
+        _gen_mora("コ", "k", 0.0, "o", 0.0, 50.0),
+        _gen_mora("ン", None, None, "N", 0.0, 50.0),
+        _gen_mora("、", None, None, "pau", 0.0, 0.0),
+        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0),
+        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
+    ]
+
+    # Expects - x4 value scaled
+    true_moras = [
+        _gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
+        _gen_mora("ン", None, None, "N", 0.0, 200.0),
+        _gen_mora("、", None, None, "pau", 0.0, 0.0),
+        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
+        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
+    ]
+
+    # Outputs
+    moras = apply_pitch_scale(input_moras, query)
+
+    assert moras == true_moras
+
+
+def test_apply_intonation_scale():
+    """Test `apply_intonation_scale`."""
+    # Inputs
+    query = _gen_query(intonationScale=0.5)
+    input_moras = [
+        _gen_mora("コ", "k", 0.0, "o", 0.0, 200.0),
+        _gen_mora("ン", None, None, "N", 0.0, 200.0),
+        _gen_mora("、", None, None, "pau", 0.0, 0.0),
+        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 500.0),
+        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
+    ]
+
+    # Expects - mean=300 var x0.5 intonation scaling
+    true_moras = [
+        _gen_mora("コ", "k", 0.0, "o", 0.0, 250.0),
+        _gen_mora("ン", None, None, "N", 0.0, 250.0),
+        _gen_mora("、", None, None, "pau", 0.0, 0.0),
+        _gen_mora("ヒ", "h", 0.0, "i", 0.0, 400.0),
+        _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
+    ]
+
+    # Outputs
+    moras = apply_intonation_scale(input_moras, query)
+
+    assert moras == true_moras
+
+
+def test_apply_volume_scale():
+    """Test `apply_volume_scale`."""
+    # Inputs
+    query = _gen_query(volumeScale=3.0)
+    input_wave = numpy.array([0.0, 1.0, 2.0])
+
+    # Expects - x3 scale
+    true_wave = numpy.array([0.0, 3.0, 6.0])
+
+    # Outputs
+    wave = apply_volume_scale(input_wave, query)
+
+    assert numpy.allclose(wave, true_wave)
+
+
+def test_apply_output_sampling_rate():
+    """Test `apply_output_sampling_rate`."""
+    # Inputs
+    query = _gen_query(outputSamplingRate=12000)
+    input_wave = numpy.array([1.0 for _ in range(120)])
+    input_sr_wave = 24000
+
+    # Expects - half sampling rate
+    true_wave = numpy.array([1.0 for _ in range(60)])
+    assert true_wave.shape == (60,), "Prerequisites"
+
+    # Outputs
+    wave = apply_output_sampling_rate(input_wave, input_sr_wave, query)
+
+    assert wave.shape[0] == true_wave.shape[0]
+
+
+def test_apply_output_stereo():
+    """Test `apply_output_stereo`."""
+    # Inputs
+    query = _gen_query(outputStereo=True)
+    input_wave = numpy.array([1.0, 0.0, 2.0])
+
+    # Expects - Stereo :: (Time, Channel)
+    true_wave = numpy.array([[1.0, 1.0], [0.0, 0.0], [2.0, 2.0]])
+
+    # Outputs
+    wave = apply_output_stereo(input_wave, query)
+
+    assert numpy.array_equal(wave, true_wave)
+
+
 def test_calc_frame_per_phoneme():
     """Test `calc_frame_per_phoneme`."""
     # Inputs
@@ -325,7 +459,7 @@ def test_feat_to_framescale():
     assert true_frame_per_phoneme.shape[0] == len(phoneme_data_list), "Prerequisites"
 
     # Outputs
-    flatten_moras = pad_with_silence(flatten_moras, query)
+    flatten_moras = apply_prepost_silence(flatten_moras, query)
     frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
     f0 = calc_frame_pitch(query, flatten_moras, phoneme_data_list, frame_per_phoneme)
     frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)
diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py
index 9bd7dde56..9fa12d3a5 100644
--- a/voicevox_engine/synthesis_engine/synthesis_engine.py
+++ b/voicevox_engine/synthesis_engine/synthesis_engine.py
@@ -1,8 +1,10 @@
+import math
 import threading
 from itertools import chain
 from typing import List, Optional, Tuple
 
 import numpy
+from numpy import ndarray
 from soxr import resample
 
 from ..acoustic_feature_extractor import OjtPhoneme
@@ -112,8 +114,9 @@ def generate_silence_mora(length: float) -> Mora:
     return Mora(text=" ", vowel="sil", vowel_length=length, pitch=0.0)
 
 
-def pad_with_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]:
-    """モーラ列の先頭/最後尾へqueryに基づいた無音モーラを追加
+def apply_prepost_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]:
+    """
+    前後無音(`prePhonemeLength` & `postPhonemeLength`)の適用
     Parameters
     ----------
     moras : List[Mora]
@@ -131,6 +134,27 @@ def pad_with_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]:
     return moras
 
 
+def apply_speed_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]:
+    """
+    話速スケール(`speedScale`)の適用
+    Parameters
+    ----------
+    moras : list[Mora]
+        モーラ系列
+    query : AudioQuery
+        音声合成クエリ
+    Returns
+    -------
+    moras : list[Mora]
+        話速スケールが適用されたモーラ系列
+    """
+    for mora in moras:
+        mora.vowel_length /= query.speedScale
+        if mora.consonant_length:
+            mora.consonant_length /= query.speedScale
+    return moras
+
+
 def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
     """
     音素あたりのフレーム長を算出
@@ -145,6 +169,9 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
     frame_per_phoneme : NDArray[]
         音素あたりのフレーム長。端数丸め。
     """
+    # Apply: グローバル特徴量による補正(話速)
+    moras = apply_speed_scale(moras, query)
+
     # 音素あたりの継続長
     sec_per_phoneme = numpy.array(
         [
@@ -157,10 +184,6 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
         ],
         dtype=numpy.float32,
     )
-
-    # 話速による継続長の補正
-    sec_per_phoneme /= query.speedScale
-
     # 音素あたりのフレーム長。端数丸め。
     framerate = 24000 / 256  # framerate 93.75 [frame/sec]
     frame_per_phoneme = numpy.round(sec_per_phoneme * framerate).astype(numpy.int32)
@@ -168,6 +191,48 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
     return frame_per_phoneme
 
 
+def apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]:
+    """
+    音高スケール(`pitchScale`)の適用
+    Parameters
+    ----------
+    moras : list[Mora]
+        モーラ系列
+    query : AudioQuery
+        音声合成クエリ
+    Returns
+    -------
+    moras : list[Mora]
+        音高スケールが適用されたモーラ系列
+    """
+    for mora in moras:
+        mora.pitch *= 2**query.pitchScale
+    return moras
+
+
+def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]:
+    """
+    抑揚スケール(`intonationScale`)の適用
+    Parameters
+    ----------
+    moras : list[Mora]
+        モーラ系列
+    query : AudioQuery
+        音声合成クエリ
+    Returns
+    -------
+    moras : list[Mora]
+        抑揚スケールが適用されたモーラ系列
+    """
+    # 有声音素 (f0>0) の平均値に対する乖離度をスケール
+    voiced = list(filter(lambda mora: mora.pitch > 0, moras))
+    mean_f0 = numpy.mean(list(map(lambda mora: mora.pitch, voiced))).item()
+    if not math.isnan(mean_f0):  # 空リスト -> NaN
+        for mora in voiced:
+            mora.pitch = (mora.pitch - mean_f0) * query.intonationScale + mean_f0
+    return moras
+
+
 def calc_frame_pitch(
     query: AudioQuery,
     moras: List[Mora],
@@ -191,30 +256,41 @@ def calc_frame_pitch(
     frame_f0 : NDArray[]
         フレームごとの基本周波数系列
     """
+    moras = apply_pitch_scale(moras, query)
+    moras = apply_intonation_scale(moras, query)
+
+    # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790)
     # モーラごとの基本周波数
     f0 = numpy.array([mora.pitch for mora in moras], dtype=numpy.float32)
 
-    # 音高スケールによる補正
-    f0 *= 2**query.pitchScale
-
-    # 抑揚スケールによる補正。有声音素 (f0>0) の平均値に対する乖離度をスケール
-    voiced = f0 > 0
-    mean_f0 = f0[voiced].mean()
-    if not numpy.isnan(mean_f0):
-        f0[voiced] = (f0[voiced] - mean_f0) * query.intonationScale + mean_f0
-
-    # フレームごとのピッチ化
+    # Rescale: 時間スケールの変更(モーラ -> フレーム)
     # 母音インデックスに基づき "音素あたりのフレーム長" を "モーラあたりのフレーム長" に集約
     vowel_indexes = numpy.array(split_mora(phonemes)[2])
     frame_per_mora = [
         a.sum() for a in numpy.split(frame_per_phoneme, vowel_indexes[:-1] + 1)
     ]
-    # モーラの基本周波数を子音・母音に割当てフレーム化
     frame_f0 = numpy.repeat(f0, frame_per_mora)
     return frame_f0
 
 
+def apply_volume_scale(wave: numpy.ndarray, query: AudioQuery) -> numpy.ndarray:
+    """
+    音量スケール(`volumeScale`)の適用
+    Parameters
+    ----------
+    wave : numpy.ndarray
+        音声波形
+    query : AudioQuery
+        音声合成クエリ
+    Returns
+    -------
+    wave : numpy.ndarray
+        音量スケールが適用された音声波形
+    """
+    wave *= query.volumeScale
+    return wave
+
+
 def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndarray):
     """
     フレームごとの音素列の生成(onehot化 + フレーム化)
@@ -230,11 +306,59 @@ def calc_frame_phoneme(phonemes: List[OjtPhoneme], frame_per_phoneme: numpy.ndar
         フレームごとの音素系列
     """
     # TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790)
+    # Convert: Core入力形式への変換(onehotベクトル系列)
     onehot_phoneme = numpy.stack([p.onehot for p in phonemes])
+
+    # Rescale: 時間スケールの変更(音素 -> フレーム)
     frame_phoneme = numpy.repeat(onehot_phoneme, frame_per_phoneme, axis=0)
     return frame_phoneme
 
 
+def apply_output_sampling_rate(
+    wave: ndarray, sr_wave: int, query: AudioQuery
+) -> ndarray:
+    """
+    出力サンプリングレート(`outputSamplingRate`)の適用
+    Parameters
+    ----------
+    wave : ndarray
+        音声波形
+    sr_wave : int
+        `wave`のサンプリングレート
+    query : AudioQuery
+        音声合成クエリ
+    Returns
+    -------
+    wave : ndarray
+        出力サンプリングレートが適用された音声波形
+    """
+    # サンプリングレート一致のときはスルー
+    if sr_wave == query.outputSamplingRate:
+        return wave
+
+    wave = resample(wave, sr_wave, query.outputSamplingRate)
+    return wave
+
+
+def apply_output_stereo(wave: ndarray, query: AudioQuery) -> ndarray:
+    """
+    ステレオ出力(`outputStereo`)の適用
+    Parameters
+    ----------
+    wave : ndarray
+        音声波形
+    query : AudioQuery
+        音声合成クエリ
+    Returns
+    -------
+    wave : ndarray
+        ステレオ出力設定が適用された音声波形
+    """
+    if query.outputStereo:
+        wave = numpy.array([wave, wave]).T
+    return wave
+
+
 class SynthesisEngine(SynthesisEngineBase):
     """音声合成器(core)の管理/実行/プロキシと音声合成フロー"""
 
@@ -493,7 +617,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
         # AccentPhraseをすべてMoraおよびOjtPhonemeの形に分解し、処理可能な形にする
         flatten_moras, phoneme_data_list = pre_process(query.accent_phrases)
 
-        flatten_moras = pad_with_silence(flatten_moras, query)
+        flatten_moras = apply_prepost_silence(flatten_moras, query)
         frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
         f0 = calc_frame_pitch(
             query, flatten_moras, phoneme_data_list, frame_per_phoneme
@@ -509,21 +633,10 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
             phoneme=phoneme,
             style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1),
         )
+        sr_wave = self.default_sampling_rate
 
-        # volume: ゲイン適用
-        wave *= query.volumeScale
-
-        # 出力サンプリングレートがデフォルト(decode forwarderによるもの、24kHz)でなければ、それを適用する
-        if query.outputSamplingRate != self.default_sampling_rate:
-            wave = resample(
-                wave,
-                self.default_sampling_rate,
-                query.outputSamplingRate,
-            )
-
-        # ステレオ変換
-        # 出力設定がステレオなのであれば、ステレオ化する
-        if query.outputStereo:
-            wave = numpy.array([wave, wave]).T
+        wave = apply_volume_scale(wave, query)
+        wave = apply_output_sampling_rate(wave, sr_wave, query)
+        wave = apply_output_stereo(wave, query)
 
         return wave
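
Notes on the refactor, with standalone sketches (illustrative only, not part of the diff):

The test durations are multiples of 0.01067 because `calc_frame_per_phoneme` quantizes seconds to frames at a fixed 24000 / 256 = 93.75 frames/sec, and 1 / 93.75 ≈ 0.01067 sec, so each unit rounds to exactly one frame. A quick check with plain numpy, mirroring the rounding used in the diff:

    import numpy

    framerate = 24000 / 256  # 93.75 frames/sec; 1 frame ~= 0.01067 sec
    sec_per_phoneme = numpy.array([2 * 0.01067, 4 * 0.01067], dtype=numpy.float32)
    # Same rounding as calc_frame_per_phoneme: seconds -> whole frames
    print(numpy.round(sec_per_phoneme * framerate).astype(numpy.int32))  # [2 4]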
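
`apply_intonation_scale` rescales each voiced mora's pitch around the mean f0 of the voiced morae only (f0 > 0), leaving unvoiced morae at 0. A self-contained sketch of that arithmetic using the values from `test_apply_intonation_scale`; `scale_intonation` is a hypothetical name, not part of the diff:

    import numpy

    def scale_intonation(f0: numpy.ndarray, scale: float) -> numpy.ndarray:
        # Hypothetical sketch: scale voiced (f0 > 0) deviation from the voiced mean.
        voiced = f0 > 0
        if not voiced.any():  # no voiced morae -> nothing to rescale
            return f0
        mean_f0 = f0[voiced].mean()
        f0 = f0.copy()
        f0[voiced] = (f0[voiced] - mean_f0) * scale + mean_f0
        return f0

    # Voiced mean = (200 + 200 + 500) / 3 = 300.0
    print(scale_intonation(numpy.array([200.0, 200.0, 0.0, 500.0, 0.0]), 0.5))
    # -> [250. 250.   0. 400.   0.]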
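
After decoding, `_synthesis_impl` now applies the waveform effects as three composable steps: volume scale, output sampling rate, then stereo expansion. A minimal end-to-end sketch of that chain, assuming `soxr` is installed (the diff already imports it); `postprocess` is a hypothetical wrapper, not part of the diff:

    import numpy
    from soxr import resample  # same resampler the engine uses

    def postprocess(
        wave: numpy.ndarray, sr_wave: int, volume: float, sr_out: int, stereo: bool
    ) -> numpy.ndarray:
        # Hypothetical wrapper: volume -> sampling rate -> stereo, in the
        # order _synthesis_impl applies them.
        wave = wave * volume
        if sr_wave != sr_out:
            wave = resample(wave, sr_wave, sr_out)
        if stereo:
            wave = numpy.array([wave, wave]).T  # (Time,) -> (Time, Channel)
        return wave

    # 120 samples at 24 kHz, tripled gain, downsampled to 12 kHz, stereo
    print(postprocess(numpy.ones(120), 24000, 3.0, 12000, True).shape)  # (60, 2)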