From d2daaa5201762b9f0a043f69afed6633c0ddba7f Mon Sep 17 00:00:00 2001
From: Ryo Yamashita <qryxip@gmail.com>
Date: Sat, 16 Nov 2024 11:12:29 +0900
Subject: [PATCH] =?UTF-8?q?feat:=20`pause=5Flength{,=5Fscale}`=E3=82=92?=
 =?UTF-8?q?=E3=83=87=E3=83=95=E3=82=A9=E3=83=AB=E3=83=88=E5=80=A4=E9=99=90?=
 =?UTF-8?q?=E5=AE=9A=E3=81=A7=E5=8F=97=E3=81=91=E5=85=A5=E3=82=8C=E3=82=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

VOICEVOX/voicevox_engine#1308 と VOICEVOX/voicevox_engine#1425 の一部
を参考にコードを書いた。

TODO: @X-20A さんの許諾を取るか、"hihoライセンス"経由で取り込む旨を書く

Co-Authored-By: https://github.com/VOICEVOX/voicevox_core/issues/874#issuecomment-2482637392
Co-Authored-By: sabonerune <102559104+sabonerune@users.noreply.github.com>
Refs: https://github.com/VOICEVOX/voicevox_core/issues/874#issuecomment-2482637392
---
 crates/voicevox_core/src/engine/model.rs      | 140 +++++++++++++++++-
 crates/voicevox_core/src/synthesizer.rs       |   2 +
 .../jp/hiroshiba/voicevoxcore/AudioQuery.java |  13 ++
 .../python/voicevox_core/_models.py           |   6 +
 4 files changed, 160 insertions(+), 1 deletion(-)
diff --git a/crates/voicevox_core/src/engine/model.rs b/crates/voicevox_core/src/engine/model.rs
index 20203feb3..fad10f493 100644
--- a/crates/voicevox_core/src/engine/model.rs
+++ b/crates/voicevox_core/src/engine/model.rs
@@ -1,4 +1,7 @@
-use serde::{Deserialize, Serialize};
+use std::fmt;
+
+use duplicate::duplicate_item;
+use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
 
 /* 各フィールドのjsonフィールド名はsnake_caseとする*/
 
@@ -64,6 +67,20 @@ pub struct AudioQuery {
     pub output_sampling_rate: u32,
     /// 音声データをステレオ出力するか否か。
     pub output_stereo: bool,
+    /// 句読点などの無音時間。`null`のときは無視される。デフォルト値は`null`。
+    #[serde(
+        default,
+        deserialize_with = "deserialize_pause_length",
+        serialize_with = "serialize_pause_length"
+    )]
+    pub pause_length: (),
+    /// 句読点などの無音時間（倍率）。デフォルト値は`1`。
+    #[serde(
+        default,
+        deserialize_with = "deserialize_pause_length_scale",
+        serialize_with = "serialize_pause_length_scale"
+    )]
+    pub pause_length_scale: (),
     /// \[読み取り専用\] AquesTalk風記法。
     ///
     /// [`Synthesizer::audio_query`]が返すもののみ`Some`となる。入力としてのAudioQueryでは無視され
@@ -73,6 +90,87 @@ pub struct AudioQuery {
     pub kana: Option<String>,
 }
 
+fn deserialize_pause_length<'de, D>(deserializer: D) -> Result<(), D::Error>
+where
+    D: Deserializer<'de>,
+{
+    return deserializer.deserialize_any(Visitor);
+    // `deserialize_with` hook for `pause_length`: a unit value passes, any number is refused.
+    struct Visitor;
+
+    impl<'de> de::Visitor<'de> for Visitor {
+        type Value = ();
+
+        fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
+            formatter.write_str("`null`")
+        }
+        // Generates `visit_i64`, `visit_u64` and `visit_f64`; each rejects the number it is given.
+        #[duplicate_item(
+            method        T;
+            [ visit_i64 ] [ i64 ];
+            [ visit_u64 ] [ u64 ];
+            [ visit_f64 ] [ f64 ];
+        )]
+        fn method<E>(self, _: T) -> Result<Self::Value, E>
+        where
+            E: de::Error,
+        {
+            Err(E::custom("currently `pause_length` must be `null`"))
+        }
+        // The single accepted input (serde_json reports JSON `null` through this method).
+        fn visit_unit<E>(self) -> Result<Self::Value, E> {
+            Ok(())
+        }
+    }
+}
+
+fn serialize_pause_length<S>(_: &(), serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: Serializer,
+{
+    serializer.serialize_unit() // `serialize_with` hook: the field is always written as a unit value
+}
+
+fn deserialize_pause_length_scale<'de, D>(deserializer: D) -> Result<(), D::Error>
+where
+    D: Deserializer<'de>,
+{
+    return deserializer.deserialize_any(Visitor);
+    // `deserialize_with` hook for `pause_length_scale`: only a number equal to one is accepted.
+    struct Visitor;
+
+    impl<'de> de::Visitor<'de> for Visitor {
+        type Value = ();
+
+        fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
+            formatter.write_str("`1.`")
+        }
+        // Generates `visit_i64`, `visit_u64` and `visit_f64`, each comparing against its own `ONE`.
+        #[duplicate_item(
+            method        T       ONE;
+            [ visit_i64 ] [ i64 ] [ 1 ];
+            [ visit_u64 ] [ u64 ] [ 1 ];
+            [ visit_f64 ] [ f64 ] [ 1. ];
+        )]
+        fn method<E>(self, v: T) -> Result<Self::Value, E>
+        where
+            E: de::Error,
+        {
+            if v != ONE {
+                return Err(E::custom("currently `pause_length_scale` must be `1.`"));
+            }
+            Ok(())
+        }
+    }
+}
+
+fn serialize_pause_length_scale<S>(_: &(), serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: Serializer,
+{
+    (1.).serialize(serializer) // `serialize_with` hook: the field is always written as the constant `1.`
+}
+
 impl AudioQuery {
     pub(crate) fn with_kana(self, kana: Option<String>) -> Self {
         Self { kana, ..self }
@@ -99,6 +197,8 @@ mod tests {
             post_phoneme_length: 0.0,
             output_sampling_rate: 0,
             output_stereo: false,
+            pause_length: (),
+            pause_length_scale: (),
             kana: None,
         };
         let val = serde_json::to_value(audio_query_model).unwrap();
@@ -152,4 +252,42 @@ mod tests {
         }))?;
         Ok(())
     }
+
+    // TODO: Remove this test once the types make this trivially true.
+    #[rstest]
+    fn it_denies_non_null_for_pause_length() {
+        serde_json::from_value::<AudioQuery>(json!({
+            "accent_phrases": [],
+            "speed_scale": 1.0,
+            "pitch_scale": 0.0,
+            "intonation_scale": 1.0,
+            "volume_scale": 1.0,
+            "pre_phoneme_length": 0.1,
+            "post_phoneme_length": 0.1,
+            "output_sampling_rate": 24000,
+            "output_stereo": false,
+            "pause_length": "aaaaa"
+        }))
+        .map(|_| ()) // discard the `Ok` payload; only the failure matters here
+        .unwrap_err(); // panics (failing the test) if deserialization succeeded
+    }
+
+    // TODO: Remove this test once the types make this trivially true.
+    #[rstest]
+    fn it_denies_non_float_for_pause_length_scale() {
+        serde_json::from_value::<AudioQuery>(json!({
+            "accent_phrases": [],
+            "speed_scale": 1.0,
+            "pitch_scale": 0.0,
+            "intonation_scale": 1.0,
+            "volume_scale": 1.0,
+            "pre_phoneme_length": 0.1,
+            "post_phoneme_length": 0.1,
+            "output_sampling_rate": 24000,
+            "output_stereo": false,
+            "pause_length_scale": "aaaaa",
+        }))
+        .map(|_| ()) // discard the `Ok` payload; only the failure matters here
+        .unwrap_err(); // panics (failing the test) if deserialization succeeded
+    }
 }
diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs
index b68d1ab2c..ebe06e270 100644
--- a/crates/voicevox_core/src/synthesizer.rs
+++ b/crates/voicevox_core/src/synthesizer.rs
@@ -1185,6 +1185,8 @@ mod inner {
                 post_phoneme_length: 0.1,
                 output_sampling_rate: DEFAULT_SAMPLING_RATE,
                 output_stereo: false,
+                pause_length: (),
+                pause_length_scale: (),
                 kana: Some(kana),
             }
         }
diff --git a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AudioQuery.java b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AudioQuery.java
index 2f50c7235..afc735034 100644
--- a/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AudioQuery.java
+++ b/crates/voicevox_core_java_api/lib/src/main/java/jp/hiroshiba/voicevoxcore/AudioQuery.java
@@ -55,6 +55,17 @@ public class AudioQuery {
   @Expose
   public boolean outputStereo;
 
+  /** 句読点などの無音時間。{@code null}のときは無視される。デフォルト値は{@code null}。 */
+  @SerializedName("pause_length")
+  @Expose
+  @Nullable
+  public Double pauseLength;
+
+  /** 句読点などの無音時間（倍率）。デフォルト値は{@code 1.}。 */
+  @SerializedName("pause_length_scale")
+  @Expose
+  public double pauseLengthScale;
+
   /**
    * [読み取り専用] AquesTalk風記法。
    *
@@ -75,6 +86,8 @@ public AudioQuery() {
     this.prePhonemeLength = 0.1;
     this.postPhonemeLength = 0.1;
     this.outputSamplingRate = 24000;
+    this.pauseLength = null;
+    this.pauseLengthScale = 1.0;
     this.kana = null;
   }
 }
diff --git a/crates/voicevox_core_python_api/python/voicevox_core/_models.py b/crates/voicevox_core_python_api/python/voicevox_core/_models.py
index 941ed84fc..9af47148a 100644
--- a/crates/voicevox_core_python_api/python/voicevox_core/_models.py
+++ b/crates/voicevox_core_python_api/python/voicevox_core/_models.py
@@ -208,6 +208,12 @@ class AudioQuery:
     output_stereo: bool
     """音声データをステレオ出力するか否か。"""
 
+    pause_length: None = None
+    """句読点などの無音時間。 ``None`` のときは無視される。デフォルト値は ``None`` 。"""
+
+    pause_length_scale: float = 1.0
+    """句読点などの無音時間（倍率）。デフォルト値は ``1.0`` 。"""
+
     kana: Optional[str] = None
     """
     [読み取り専用] AquesTalk風記法。