From cd17924897cc8e668866a6b986a0c25daa2d0dc7 Mon Sep 17 00:00:00 2001 From: phenylshima <49227365+femshima@users.noreply.github.com> Date: Tue, 27 Feb 2024 00:45:06 +0900 Subject: [PATCH] =?UTF-8?q?jlabel=E3=82=92=E5=B0=8E=E5=85=A5=E3=81=99?= =?UTF-8?q?=E3=82=8B=20(#742)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add jlabel * create convert less allocation less unwrap move to full_context_label proper error handling switch change order fix bugs * remove unused code * remove debug * refactor * implement workarounds * explicitly return None * refactor * fmt * use rust functionality if possible * refactor SplitByKey * refactor generate_moras * refactor generate_accent_phrases Co-authored-by: cm-ayf * better extraction from Label * unify mora_to_text * bring removed comment back * create anyhow directly * use `into` Co-authored-by: cm-ayf * remove unnecessary as_deref * remove as usize * use easy_ext::ext * use itertools::Itertools * copy chunk_by from Rust * fix test name * use smallvec * stricter condition of saturated position * simpler match * add test for label parser and accent phrase generator * test openjtalk as well * split test using rstest_reuse * fix clippy * remove duplicate test --------- Co-authored-by: cm-ayf --- Cargo.lock | 65 ++- Cargo.toml | 2 + crates/voicevox_core/Cargo.toml | 2 + .../src/engine/full_context_label.rs | 537 ++++++++---------- crates/voicevox_core/src/engine/mod.rs | 4 +- crates/voicevox_core/src/synthesizer.rs | 6 +- crates/voicevox_core/src/text_analyzer.rs | 86 +-- 7 files changed, 299 insertions(+), 403 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d85693071..d1b395e9b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1060,7 +1060,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -1071,7 +1071,7 @@ checksum = "836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" dependencies = [ "darling_core", "quote", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -1258,7 +1258,7 @@ checksum = "48c69b3965971f5d0ea6a6dd26b55cdd517ae0e1425dc8d94e482a5915bd7ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -1271,7 +1271,7 @@ dependencies = [ "num-traits", "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -1504,7 +1504,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -2020,6 +2020,15 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc" +[[package]] +name = "jlabel" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f040b22c55628977296069dbf8635be49cc510999c048a1f1bdb56d00983148" +dependencies = [ + "thiserror", +] + [[package]] name = "jni" version = "0.21.1" @@ -2579,7 +2588,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -2655,7 +2664,7 @@ dependencies = [ "regex", "regex-syntax 0.7.5", "structmeta", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -2856,9 +2865,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.69" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -2961,9 +2970,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.33" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -3224,7 +3233,7 @@ dependencies = [ "quote", "rand 0.8.5", "rustc_version 0.4.0", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -3406,7 +3415,7 @@ checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -3488,7 +3497,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -3600,9 +3609,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.10.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "snafu" @@ -3731,7 +3740,7 @@ dependencies = [ "proc-macro2", "quote", "structmeta-derive", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -3742,7 +3751,7 @@ checksum = "a60bcaff7397072dca0017d1db428e30d5002e00b6847703e2e42005c95fbe00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -3809,9 +3818,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.38" +version = "2.0.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" dependencies = [ "proc-macro2", "quote", @@ -3890,22 +3899,22 @@ checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16" [[package]] name = "thiserror" -version = "1.0.37" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10deb33631e3c9018b9baf9dcbbc4f737320d2b576bac10f6aefa048fa407e3e" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.37" +version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "982d17546b47146b28f7c22e3d08465f6b8903d0ea13c1660d9d84a6e7adcdbb" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", - "syn 1.0.102", + "syn 2.0.48", ] [[package]] @@ -4023,7 +4032,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] @@ -4370,6 +4379,7 @@ dependencies = [ "humansize", "indexmap 2.0.0", "itertools 0.10.5", + "jlabel", "nanoid", "ndarray", "once_cell", @@ -4383,6 +4393,7 @@ dependencies = [ "rstest_reuse", "serde", "serde_json", + "smallvec", "tempfile", "test_util", "thiserror", @@ -4459,7 +4470,7 @@ dependencies = [ "indexmap 2.0.0", "proc-macro2", "quote", - "syn 2.0.38", + "syn 2.0.48", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index fb42feb2a..5c2ceb5be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ indexmap = "2.0.0" indicatif = "0.17.3" inventory = "0.3.4" itertools = "0.10.5" +jlabel = "0.1.2" jni = "0.21.1" libc = "0.2.134" libloading = "0.7.3" @@ -66,6 +67,7 @@ rstest_reuse = "0.6.0" serde = "1.0.145" serde_json = "1.0.85" serde_with = "3.3.0" +smallvec = "1.13.1" strum = "0.24.1" surf = "2.3.2" syn = "2.0.38" diff --git a/crates/voicevox_core/Cargo.toml b/crates/voicevox_core/Cargo.toml index 1bb228e16..02f1860cf 100644 --- a/crates/voicevox_core/Cargo.toml +++ b/crates/voicevox_core/Cargo.toml @@ -23,6 +23,7 @@ fs-err = { workspace = true, features = ["tokio"] } futures.workspace = true indexmap = { workspace = true, features = ["serde"] } itertools.workspace = true +jlabel.workspace = true nanoid.workspace = true ndarray.workspace = true once_cell.workspace = true @@ -33,6 +34,7 @@ rayon.workspace = true regex.workspace = true serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true, features = ["preserve_order"] } +smallvec.workspace = true tempfile.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["rt"] } # FIXME: feature-gateする diff --git a/crates/voicevox_core/src/engine/full_context_label.rs b/crates/voicevox_core/src/engine/full_context_label.rs index edda52394..d4099a900 100644 --- a/crates/voicevox_core/src/engine/full_context_label.rs +++ b/crates/voicevox_core/src/engine/full_context_label.rs @@ -1,10 +1,11 @@ -use std::collections::HashMap; +use std::str::FromStr; -use crate::engine::open_jtalk::FullcontextExtractor; -use derive_getters::Getters; -use derive_new::new; -use once_cell::sync::Lazy; -use regex::Regex; +use crate::{ + engine::{self, open_jtalk::FullcontextExtractor, MoraModel}, + AccentPhraseModel, +}; +use jlabel::{Label, Mora}; +use smallvec::SmallVec; // FIXME: 入力テキストをここで持って、メッセージに含む #[derive(thiserror::Error, Debug)] @@ -20,322 +21,261 @@ enum ErrorKind { #[display(fmt = "Open JTalkで解釈することができませんでした")] OpenJtalk, - #[display(fmt = "label parse error label: {label}")] - LabelParse { label: String }, + #[display(fmt = "jlabelでラベルを解釈することができませんでした")] + Jlabel, - #[display(fmt = "too long mora mora_phonemes: {mora_phonemes:?}")] - TooLongMora { mora_phonemes: Vec }, - - #[display(fmt = "invalid mora: {mora:?}")] - InvalidMora { mora: Box }, + #[display(fmt = "too long mora")] + TooLongMora, } type Result = std::result::Result; -#[derive(new, Getters, Clone, PartialEq, Eq, Debug)] -pub struct Phoneme { - contexts: HashMap, - label: String, -} - -static P3_REGEX: Lazy = Lazy::new(|| Regex::new(r"(\-(.*?)\+)").unwrap()); -static A2_REGEX: Lazy = Lazy::new(|| Regex::new(r"(\+(\d+|xx)\+)").unwrap()); -static A3_REGEX: Lazy = Lazy::new(|| Regex::new(r"(\+(\d+|xx)/B:)").unwrap()); -static F1_REGEX: Lazy = Lazy::new(|| Regex::new(r"(/F:(\d+|xx)_)").unwrap()); -static F2_REGEX: Lazy = Lazy::new(|| Regex::new(r"(_(\d+|xx)\#)").unwrap()); -static F3_REGEX: Lazy = Lazy::new(|| Regex::new(r"(\#(\d+|xx)_)").unwrap()); -static F5_REGEX: Lazy = Lazy::new(|| Regex::new(r"(@(\d+|xx)_)").unwrap()); -static H1_REGEX: Lazy = Lazy::new(|| Regex::new(r"(/H:(\d+|xx)_)").unwrap()); -static I3_REGEX: Lazy = Lazy::new(|| Regex::new(r"(@(\d+|xx)\+)").unwrap()); -static J1_REGEX: Lazy = Lazy::new(|| Regex::new(r"(/J:(\d+|xx)_)").unwrap()); - -fn string_feature_by_regex(re: &Regex, label: &str) -> std::result::Result { - if let Some(caps) = re.captures(label) { - Ok(caps[2].to_string()) - } else { - Err(ErrorKind::LabelParse { - label: label.into(), - }) - } -} - -impl Phoneme { - fn from_label(label: impl Into) -> std::result::Result { - let mut contexts = HashMap::::with_capacity(10); - let label = label.into(); - contexts.insert("p3".into(), string_feature_by_regex(&P3_REGEX, &label)?); - contexts.insert("a2".into(), string_feature_by_regex(&A2_REGEX, &label)?); - contexts.insert("a3".into(), string_feature_by_regex(&A3_REGEX, &label)?); - contexts.insert("f1".into(), string_feature_by_regex(&F1_REGEX, &label)?); - contexts.insert("f2".into(), string_feature_by_regex(&F2_REGEX, &label)?); - contexts.insert("f3".into(), string_feature_by_regex(&F3_REGEX, &label)?); - contexts.insert("f5".into(), string_feature_by_regex(&F5_REGEX, &label)?); - contexts.insert("h1".into(), string_feature_by_regex(&H1_REGEX, &label)?); - contexts.insert("i3".into(), string_feature_by_regex(&I3_REGEX, &label)?); - contexts.insert("j1".into(), string_feature_by_regex(&J1_REGEX, &label)?); - - Ok(Self::new(contexts, label)) - } - - pub fn phoneme(&self) -> &str { - self.contexts.get("p3").unwrap().as_str() - } - - pub fn is_pause(&self) -> bool { - self.contexts.get("f1").unwrap().as_str() == "xx" - } -} - -#[derive(new, Getters, Clone, PartialEq, Eq, Debug)] -pub struct Mora { - consonant: Option, - vowel: Phoneme, +pub(crate) fn extract_full_context_label( + open_jtalk: &impl FullcontextExtractor, + text: impl AsRef, +) -> Result> { + let labels = open_jtalk + .extract_fullcontext(text.as_ref()) + .map_err(|source| FullContextLabelError { + context: ErrorKind::OpenJtalk, + source: Some(source), + })?; + + let parsed_labels = labels + .into_iter() + .map(|s| Label::from_str(&s)) + .collect::, _>>() + .map_err(|source| FullContextLabelError { + context: ErrorKind::Jlabel, + source: Some(source.into()), + })?; + + generate_accent_phrases(&parsed_labels).map_err(|context| FullContextLabelError { + context, + source: None, + }) } -impl Mora { - pub fn set_context(&mut self, key: impl Into, value: impl Into) { - let key = key.into(); - let value = value.into(); - if let Some(ref mut consonant) = self.consonant { - consonant.contexts.insert(key.clone(), value.clone()); +fn generate_accent_phrases( + utterance: &[Label], +) -> std::result::Result, ErrorKind> { + let mut accent_phrases = Vec::with_capacity( + utterance + .first() + .map(|label| label.utterance.accent_phrase_count.into()) + .unwrap_or(0), + ); + + let split = utterance.chunk_by(|a, b| { + a.breath_group_curr == b.breath_group_curr && a.accent_phrase_curr == b.accent_phrase_curr + }); + for labels in split { + let moras = generate_moras(labels)?; + if moras.is_empty() { + continue; } - self.vowel.contexts.insert(key, value); - } - pub fn phonemes(&self) -> Vec { - if self.consonant.is_some() { - vec![ - self.consonant().as_ref().unwrap().clone(), - self.vowel.clone(), - ] + let Some(Label { + accent_phrase_curr: Some(ap_curr), + breath_group_curr: Some(bg_curr), + .. + }) = labels.first() + else { + continue; + }; + + // Breath Groupの中で最後のアクセント句かつ,Utteranceの中で最後のBreath Groupでない場合は次がpauになる + let pause_mora = if ap_curr.accent_phrase_position_backward == 1 + && bg_curr.breath_group_position_backward != 1 + { + Some(MoraModel::new( + "、".into(), + None, + None, + "pau".into(), + 0., + 0., + )) } else { - vec![self.vowel.clone()] - } - } + None + }; - #[allow(dead_code)] - pub fn labels(&self) -> Vec { - self.phonemes().iter().map(|p| p.label().clone()).collect() + // workaround for VOICEVOX/voicevox_engine#55 + let accent = usize::from(ap_curr.accent_position).min(moras.len()); + + accent_phrases.push(AccentPhraseModel::new( + moras, + accent, + pause_mora, + ap_curr.is_interrogative, + )) } + Ok(accent_phrases) } -#[derive(new, Getters, Clone, Debug, PartialEq, Eq)] -pub struct AccentPhrase { - moras: Vec, - accent: usize, - is_interrogative: bool, -} - -impl AccentPhrase { - fn from_phonemes(mut phonemes: Vec) -> std::result::Result { - let mut moras = Vec::with_capacity(phonemes.len()); - let mut mora_phonemes = Vec::with_capacity(phonemes.len()); - for i in 0..phonemes.len() { - { - let phoneme = phonemes.get_mut(i).unwrap(); - if phoneme.contexts().get("a2").map(|s| s.as_str()) == Some("49") { - break; - } - mora_phonemes.push(phoneme.clone()); +fn generate_moras(accent_phrase: &[Label]) -> std::result::Result, ErrorKind> { + let mut moras = Vec::with_capacity(accent_phrase.len()); + + let split = accent_phrase.chunk_by(|a, b| a.mora == b.mora); + for labels in split { + let labels: SmallVec<[&Label; 3]> = + labels.iter().filter(|label| label.mora.is_some()).collect(); + match labels[..] { + [consonant, vowel] => { + let mora = generate_mora(Some(consonant), vowel); + moras.push(mora); } - - if i + 1 == phonemes.len() - || phonemes.get(i).unwrap().contexts().get("a2").unwrap() - != phonemes.get(i + 1).unwrap().contexts().get("a2").unwrap() - { - if mora_phonemes.len() == 1 { - moras.push(Mora::new(None, mora_phonemes[0].clone())); - } else if mora_phonemes.len() == 2 { - moras.push(Mora::new( - Some(mora_phonemes[0].clone()), - mora_phonemes[1].clone(), - )); - } else { - return Err(ErrorKind::TooLongMora { mora_phonemes }); - } - mora_phonemes.clear(); + [vowel] => { + let mora = generate_mora(None, vowel); + moras.push(mora); + } + // silやpau以外の音素がないモーラは含めない + [] => {} + + // 音素が3つ以上ある場合: + // position_forwardとposition_backwardが飽和している場合は無視する + [Label { + mora: + Some(Mora { + position_forward: 49, + position_backward: 49, + .. + }), + .. + }, ..] => {} + _ => { + return Err(ErrorKind::TooLongMora); } - } - - let mora = &moras[0]; - let mut accent: usize = mora - .vowel() - .contexts() - .get("f2") - .ok_or_else(|| ErrorKind::InvalidMora { - mora: mora.clone().into(), - })? - .parse() - .map_err(|_| ErrorKind::InvalidMora { - mora: mora.clone().into(), - })?; - - let is_interrogative = moras - .last() - .unwrap() - .vowel() - .contexts() - .get("f3") - .map(|s| s.as_str()) - == Some("1"); - // workaround for VOICEVOX/voicevox_engine#55 - if accent > moras.len() { - accent = moras.len(); - } - - Ok(Self::new(moras, accent, is_interrogative)) - } - - #[allow(dead_code)] - pub fn set_context(&mut self, key: impl Into, value: impl Into) { - let key = key.into(); - let value = value.into(); - for mora in self.moras.iter_mut() { - mora.set_context(&key, &value); } } - - pub fn phonemes(&self) -> Vec { - self.moras.iter().flat_map(|m| m.phonemes()).collect() - } - - #[allow(dead_code)] - pub fn labels(&self) -> Vec { - self.phonemes().iter().map(|p| p.label().clone()).collect() - } - - #[allow(dead_code)] - pub fn merge(&self, accent_phrase: AccentPhrase) -> AccentPhrase { - let mut moras = self.moras().clone(); - let is_interrogative = *accent_phrase.is_interrogative(); - moras.extend(accent_phrase.moras); - AccentPhrase::new(moras, *self.accent(), is_interrogative) - } + Ok(moras) } -#[derive(new, Getters, Clone, PartialEq, Eq, Debug)] -pub struct BreathGroup { - accent_phrases: Vec, +fn generate_mora(consonant: Option<&Label>, vowel: &Label) -> MoraModel { + let consonant_phoneme = consonant.and_then(|c| c.phoneme.c.to_owned()); + let vowel_phoneme = vowel.phoneme.c.as_deref().unwrap(); + MoraModel::new( + mora_to_text(consonant_phoneme.as_deref(), vowel_phoneme), + consonant_phoneme, + consonant.and(Some(0.0)), + vowel_phoneme.to_string(), + 0.0, + 0.0, + ) } -impl BreathGroup { - fn from_phonemes(phonemes: Vec) -> std::result::Result { - let mut accent_phrases = Vec::with_capacity(phonemes.len()); - let mut accent_phonemes = Vec::with_capacity(phonemes.len()); - for i in 0..phonemes.len() { - accent_phonemes.push(phonemes.get(i).unwrap().clone()); - if i + 1 == phonemes.len() - || phonemes.get(i).unwrap().contexts().get("i3").unwrap() - != phonemes.get(i + 1).unwrap().contexts().get("i3").unwrap() - || phonemes.get(i).unwrap().contexts().get("f5").unwrap() - != phonemes.get(i + 1).unwrap().contexts().get("f5").unwrap() - { - accent_phrases.push(AccentPhrase::from_phonemes(accent_phonemes.clone())?); - accent_phonemes.clear(); - } +pub fn mora_to_text(consonant: Option<&str>, vowel: &str) -> String { + let mora_text = format!( + "{}{}", + consonant.unwrap_or(""), + match vowel { + phoneme @ ("A" | "I" | "U" | "E" | "O") => phoneme.to_lowercase(), + phoneme => phoneme.to_string(), } + ); + // もしカタカナに変換できなければ、引数で与えた文字列がそのまま返ってくる + engine::mora2text(&mora_text).to_string() +} - Ok(Self::new(accent_phrases)) +// FIXME: Remove `chunk_by` module after Rust 1.77.0 is released as stable. +use chunk_by::*; +mod chunk_by { + // Implementations in this module were copied from + // [Rust](https://github.com/rust-lang/rust/blob/746a58d4359786e4aebb372a30829706fa5a968f/library/core/src/slice/iter.rs). + + // MIT License Notice + + // Permission is hereby granted, free of charge, to any + // person obtaining a copy of this software and associated + // documentation files (the "Software"), to deal in the + // Software without restriction, including without + // limitation the rights to use, copy, modify, merge, + // publish, distribute, sublicense, and/or sell copies of + // the Software, and to permit persons to whom the Software + // is furnished to do so, subject to the following + // conditions: + // + // The above copyright notice and this permission notice + // shall be included in all copies or substantial portions + // of the Software. + // + // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF + // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED + // TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + // PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT + // SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + // CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR + // IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + // DEALINGS IN THE SOFTWARE. + + pub struct ChunkBy<'a, T, P> { + slice: &'a [T], + predicate: P, } - - #[allow(dead_code)] - pub fn set_context(&mut self, key: impl Into, value: impl Into) { - let key = key.into(); - let value = value.into(); - for accent_phrase in self.accent_phrases.iter_mut() { - accent_phrase.set_context(&key, &value); + impl<'a, T, P> ChunkBy<'a, T, P> { + pub(super) fn new(slice: &'a [T], predicate: P) -> Self { + ChunkBy { slice, predicate } } } - - pub fn phonemes(&self) -> Vec { - self.accent_phrases() - .iter() - .flat_map(|a| a.phonemes()) - .collect() - } - - #[allow(dead_code)] - pub fn labels(&self) -> Vec { - self.phonemes().iter().map(|p| p.label().clone()).collect() - } -} - -#[derive(new, Getters, Clone, PartialEq, Eq, Debug)] -pub struct Utterance { - breath_groups: Vec, - pauses: Vec, -} - -impl Utterance { - fn from_phonemes(phonemes: Vec) -> std::result::Result { - let mut breath_groups = vec![]; - let mut group_phonemes = Vec::with_capacity(phonemes.len()); - let mut pauses = vec![]; - for phoneme in phonemes.into_iter() { - if !phoneme.is_pause() { - group_phonemes.push(phoneme); + impl<'a, T, P> Iterator for ChunkBy<'a, T, P> + where + P: FnMut(&T, &T) -> bool, + { + type Item = &'a [T]; + + #[inline] + fn next(&mut self) -> Option { + if self.slice.is_empty() { + None } else { - pauses.push(phoneme); - - if !group_phonemes.is_empty() { - breath_groups.push(BreathGroup::from_phonemes(group_phonemes.clone())?); - group_phonemes.clear(); + let mut len = 1; + let mut iter = self.slice.windows(2); + while let Some([l, r]) = iter.next() { + if (self.predicate)(l, r) { + len += 1 + } else { + break; + } } + let (head, tail) = self.slice.split_at(len); + self.slice = tail; + Some(head) } } - Ok(Self::new(breath_groups, pauses)) - } - - #[allow(dead_code)] - pub fn set_context(&mut self, key: impl Into, value: impl Into) { - let key = key.into(); - let value = value.into(); - for breath_group in self.breath_groups.iter_mut() { - breath_group.set_context(&key, &value); - } - } - #[allow(dead_code)] - pub fn phonemes(&self) -> Vec { - // TODO:実装が中途半端なのであとでちゃんと実装する必要があるらしい - // https://github.com/VOICEVOX/voicevox_core/pull/174#discussion_r919982651 - let mut phonemes = Vec::with_capacity(self.breath_groups.len()); - - for i in 0..self.pauses().len() { - phonemes.push(self.pauses().get(i).unwrap().clone()); - if i < self.pauses().len() - 1 { - let p = self.breath_groups().get(i).unwrap().phonemes(); - phonemes.extend(p); + #[inline] + fn size_hint(&self) -> (usize, Option) { + if self.slice.is_empty() { + (0, Some(0)) + } else { + (1, Some(self.slice.len())) } } - phonemes } - #[allow(dead_code)] - pub fn labels(&self) -> Vec { - self.phonemes().iter().map(|p| p.label().clone()).collect() + #[easy_ext::ext(TChunkBy)] + impl [T] { + pub fn chunk_by(&self, pred: F) -> ChunkBy<'_, T, F> + where + F: FnMut(&T, &T) -> bool, + { + ChunkBy::new(self, pred) + } } - pub(crate) fn extract_full_context_label( - open_jtalk: &impl FullcontextExtractor, - text: impl AsRef, - ) -> Result { - let labels = open_jtalk - .extract_fullcontext(text.as_ref()) - .map_err(|source| FullContextLabelError { - context: ErrorKind::OpenJtalk, - source: Some(source), - })?; - - labels - .into_iter() - .map(Phoneme::from_label) - .collect::, _>>() - .and_then(Self::from_phonemes) - .map_err(|context| FullContextLabelError { - context, - source: None, - }) + #[cfg(test)] + mod tests { + use super::TChunkBy; + + #[test] + fn chunk_by() { + let mut split = [0, 0, 1, 1, 1, -5].chunk_by(|a, b| a == b); + assert_eq!(split.next(), Some([0, 0].as_slice())); + assert_eq!(split.next(), Some([1, 1, 1].as_slice())); + assert_eq!(split.next(), Some([-5].as_slice())); + assert_eq!(split.next(), None); + } } } @@ -346,11 +286,17 @@ mod tests { use ::test_util::OPEN_JTALK_DIC_DIR; use rstest::rstest; + use std::str::FromStr; + use crate::{ - engine::{open_jtalk::FullcontextExtractor, MoraModel}, - text_analyzer::{OpenJTalkAnalyzer, TextAnalyzer}, + engine::{ + full_context_label::{extract_full_context_label, generate_accent_phrases}, + open_jtalk::FullcontextExtractor, + MoraModel, + }, AccentPhraseModel, }; + use jlabel::Label; fn mora(text: &str, consonant: Option<&str>, vowel: &str) -> MoraModel { MoraModel::new( @@ -587,6 +533,19 @@ mod tests { assert_eq!(&open_jtalk.extract_fullcontext(text).unwrap(), labels); } + #[apply(label_cases)] + fn parse_labels(_text: &str, labels: &[&str], accent_phrase: &[AccentPhraseModel]) { + let parsed_labels = labels + .iter() + .map(|s| Label::from_str(s).unwrap()) + .collect::>(); + + assert_eq!( + &generate_accent_phrases(&parsed_labels).unwrap(), + accent_phrase + ); + } + #[apply(label_cases)] #[tokio::test] async fn extract_fullcontext( @@ -597,7 +556,9 @@ mod tests { let open_jtalk = crate::tokio::OpenJtalk::new(OPEN_JTALK_DIC_DIR) .await .unwrap(); - let analyzer = OpenJTalkAnalyzer::new(open_jtalk); - assert_eq!(analyzer.analyze(text).unwrap(), accent_phrase); + assert_eq!( + &extract_full_context_label(&open_jtalk, text).unwrap(), + accent_phrase + ); } } diff --git a/crates/voicevox_core/src/engine/mod.rs b/crates/voicevox_core/src/engine/mod.rs index 1c7422e76..95fe3d562 100644 --- a/crates/voicevox_core/src/engine/mod.rs +++ b/crates/voicevox_core/src/engine/mod.rs @@ -6,7 +6,9 @@ mod mora_list; pub(crate) mod open_jtalk; pub(crate) use self::acoustic_feature_extractor::OjtPhoneme; -pub(crate) use self::full_context_label::{FullContextLabelError, Utterance}; +pub(crate) use self::full_context_label::{ + extract_full_context_label, mora_to_text, FullContextLabelError, +}; pub(crate) use self::kana_parser::{create_kana, parse_kana, KanaParseError}; pub use self::model::{AccentPhraseModel, AudioQueryModel, MoraModel}; pub(crate) use self::mora_list::mora2text; diff --git a/crates/voicevox_core/src/synthesizer.rs b/crates/voicevox_core/src/synthesizer.rs index 1ee16ace0..a3c34489e 100644 --- a/crates/voicevox_core/src/synthesizer.rs +++ b/crates/voicevox_core/src/synthesizer.rs @@ -80,7 +80,7 @@ pub(crate) mod blocking { use enum_map::enum_map; use crate::{ - engine::{create_kana, MoraModel, OjtPhoneme}, + engine::{create_kana, mora_to_text, MoraModel, OjtPhoneme}, error::ErrorRepr, infer::{ domain::{ @@ -92,7 +92,7 @@ pub(crate) mod blocking { InferenceSessionOptions, }, numerics::F32Ext as _, - text_analyzer::{mora_to_text, KanaAnalyzer, OpenJTalkAnalyzer, TextAnalyzer}, + text_analyzer::{KanaAnalyzer, OpenJTalkAnalyzer, TextAnalyzer}, AccentPhraseModel, AudioQueryModel, FullcontextExtractor, Result, StyleId, SupportedDevices, SynthesisOptions, VoiceModelId, VoiceModelMeta, }; @@ -376,7 +376,7 @@ pub(crate) mod blocking { let pitch = (*last_mora.pitch() + ADJUST_PITCH).min(MAX_PITCH); MoraModel::new( - mora_to_text(last_mora.vowel()), + mora_to_text(None, last_mora.vowel()), None, None, last_mora.vowel().clone(), diff --git a/crates/voicevox_core/src/text_analyzer.rs b/crates/voicevox_core/src/text_analyzer.rs index 5ecb89d56..b684eac7c 100644 --- a/crates/voicevox_core/src/text_analyzer.rs +++ b/crates/voicevox_core/src/text_analyzer.rs @@ -1,5 +1,5 @@ use crate::{ - engine::{self, parse_kana, MoraModel, Utterance}, + engine::{extract_full_context_label, parse_kana}, AccentPhraseModel, FullcontextExtractor, Result, }; @@ -35,88 +35,6 @@ impl TextAnalyzer for OpenJTalkAnalyzer { if text.is_empty() { return Ok(Vec::new()); } - let utterance = Utterance::extract_full_context_label(&self.0, text)?; - Ok(utterance_to_accent_phrases(utterance)) + Ok(extract_full_context_label(&self.0, text)?) } } - -fn utterance_to_accent_phrases(utterance: Utterance) -> Vec { - let accent_phrases: Vec = utterance.breath_groups().iter().enumerate().fold( - Vec::new(), - |mut accum_vec, (i, breath_group)| { - accum_vec.extend(breath_group.accent_phrases().iter().enumerate().map( - |(j, accent_phrase)| { - let moras = accent_phrase - .moras() - .iter() - .map(|mora| { - let mora_text = mora - .phonemes() - .iter() - .map(|phoneme| phoneme.phoneme().to_string()) - .collect::>() - .join(""); - - let (consonant, consonant_length) = - if let Some(consonant) = mora.consonant() { - (Some(consonant.phoneme().to_string()), Some(0.)) - } else { - (None, None) - }; - - MoraModel::new( - mora_to_text(mora_text), - consonant, - consonant_length, - mora.vowel().phoneme().into(), - 0., - 0., - ) - }) - .collect(); - - let pause_mora = if i != utterance.breath_groups().len() - 1 - && j == breath_group.accent_phrases().len() - 1 - { - Some(MoraModel::new( - "、".into(), - None, - None, - "pau".into(), - 0., - 0., - )) - } else { - None - }; - - AccentPhraseModel::new( - moras, - *accent_phrase.accent(), - pause_mora, - *accent_phrase.is_interrogative(), - ) - }, - )); - - accum_vec - }, - ); - - accent_phrases -} - -pub fn mora_to_text(mora: impl AsRef) -> String { - let last_char = mora.as_ref().chars().last().unwrap(); - let mora = if ['A', 'I', 'U', 'E', 'O'].contains(&last_char) { - format!( - "{}{}", - &mora.as_ref()[0..mora.as_ref().len() - 1], - last_char.to_lowercase() - ) - } else { - mora.as_ref().to_string() - }; - // もしカタカナに変換できなければ、引数で与えた文字列がそのまま返ってくる - engine::mora2text(&mora).to_string() -}