From 3483887d920ae9988ed02073d91be8678a40b1cf Mon Sep 17 00:00:00 2001 From: Gray Suitcase <41382894+PickledChair@users.noreply.github.com> Date: Fri, 24 Feb 2023 01:36:55 +0900 Subject: [PATCH] =?UTF-8?q?python=20(FFI)=20example=20=E3=82=92=E5=89=8A?= =?UTF-8?q?=E9=99=A4=20(#432)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 3 +- crates/voicevox_core_python_api/README.md | 2 +- example/pyo3/.gitignore | 8 - example/pyo3/README.md | 49 ----- example/pyo3/run.py | 78 -------- example/python/.gitignore | 141 +------------- example/python/README.md | 60 ++++-- example/python/core.py | 227 ---------------------- example/python/requirements.txt | 1 - example/python/run.py | 105 ++++++---- 10 files changed, 111 insertions(+), 563 deletions(-) delete mode 100644 example/pyo3/.gitignore delete mode 100644 example/pyo3/README.md delete mode 100644 example/pyo3/run.py delete mode 100644 example/python/core.py delete mode 100644 example/python/requirements.txt diff --git a/README.md b/README.md index 46f266e4a..d4b90e00b 100644 --- a/README.md +++ b/README.md @@ -89,8 +89,7 @@ sudo apt install libgomp1 現在このリポジトリでは次のサンプルが提供されています。実行方法についてはそれぞれのディレクトリ内にある README を参照してください -- [Python](./example/python) -- [Python(pip)](./example/pyo3) +- [Python(pip)](./example/python) - [C++(UNIX CMake)](./example/cpp/unix) - [C++(Windows Visual Studio)](./example/cpp/windows) diff --git a/crates/voicevox_core_python_api/README.md b/crates/voicevox_core_python_api/README.md index 10df981c4..b5d71d819 100644 --- a/crates/voicevox_core_python_api/README.md +++ b/crates/voicevox_core_python_api/README.md @@ -70,4 +70,4 @@ venv を作ったらその venv 上で Maturin をインストールします。 ## サンプル実行 -`maturin develop` で editable な状態でインストールした後、[example/pyo3](../../example/pyo3) にてサンプルを実行できます。 +`maturin develop` で editable な状態でインストールした後、[example/python](../../example/python) にてサンプルを実行できます。 diff --git a/example/pyo3/.gitignore b/example/pyo3/.gitignore deleted file 
mode 100644 index c4d7f114d..000000000 --- a/example/pyo3/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# OpenJTalk-dictionary's dir -open_jtalk_dic_utf_8-* - -# shared library -*.so -*.so.* -*.dylib -*.dll diff --git a/example/pyo3/README.md b/example/pyo3/README.md deleted file mode 100644 index 05ff6e7e8..000000000 --- a/example/pyo3/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# Python サンプルコード (PyO3 によるバインディング経由) - -voicevox_core ライブラリ の Python バインディングを使った音声合成のサンプルコードです。 -`pip install`で導入することができます。 - -## 準備 - -TODO - -- Python インタプリタ ≧3.8 + venv -- voicevox_core_python_api の whl (`pip install`) -- onnxruntime の DLL (/README.md と同様) -- open_jtalk_dic_utf_8-1.11 (/README.md と同様) - -## 実行 - -Open JTalk 辞書ディレクトリ、読み上げさせたい文章、出力 wav ファイルのパスの 3 つを指定して run.py を実行します。 - -```console -❯ python ./run.py -h -usage: run.py [-h] [--mode MODE] open_jtalk_dict_dir text out - -positional arguments: - open_jtalk_dict_dir Open JTalkの辞書ディレクトリ - text 読み上げさせたい文章 - out 出力wavファイルのパス - -optional arguments: - -h, --help show this help message and exit - --mode MODE モード ("AUTO", "CPU", "GPU") -``` - -```console -❯ # python ./run.py <Open JTalk辞書ディレクトリ> <読み上げさせたい文章> <出力wavファイルのパス> -❯ python ./run.py ./open_jtalk_dic_utf_8-1.11 これはテストです ./audio.wav -[DEBUG] run.py: voicevox_core.METAS=[Meta(name='四国めたん', styles=[Style(name='あまあま', id=0)], speaker_uuid='7ffcb7ce-00ec-4bdc-82cd-45a8889e43ff', version='0.0.1'), Meta(name='ずんだもん', styles=[Style(name='あまあま', id=1)], speaker_uuid='388f246b-8c41-4ac1-8e2d-5d79f3ff56d9', version='0.0.1')] -[DEBUG] run.py: voicevox_core.SUPPORTED_DEVICES=SupportedDevices(cpu=True, cuda=True, dml=False) -[INFO] run.py: Initializing (acceleration_mode=<AccelerationMode.AUTO: 'AUTO'>, open_jtalk_dict_dir=PosixPath('open_jtalk_dic_utf_8-1.11')) -[DEBUG] run.py: core.is_gpu_mode=True -[INFO] run.py: Loading model 0 -[DEBUG] run.py: core.is_model_loaded(0)=True -[INFO] run.py: Creating an AudioQuery from 'これはテストです' -[INFO] run.py: Synthesizing with {"accent_phrases": [{"moras": [{"text": "コ", "consonant": "k", "consonant_length":
0.063058704, "vowel": "o", "vowel_length": 0.08937682, "pitch": 5.5699596}, {"text": "レ", "consonant": "r", "consonant_length": 0.047547057, "vowel": "e", "vowel_length": 0.07596417, "pitch": 5.6643105}, {"text": "ワ", "consonant": "w", "consonant_length": 0.053706698, "vowel": "a", "vowel_length": 0.10348523, "pitch": 5.7773285}], "accent": 3, "pause_mora": null, "is_interrogative": false}, {"moras": [{"text": "テ", "consonant": "t", "consonant_length": 0.06311223, "vowel": "e", "vowel_length": 0.07596652, "pitch": 5.881741}, {"text": "ス", "consonant": "s", "consonant_length": 0.038565055, "vowel": "U", "vowel_length": 0.050694168, "pitch": 0.0}, {"text": "ト", "consonant": "t", "consonant_length": 0.06685759, "vowel": "o", "vowel_length": 0.0753997, "pitch": 5.737323}, {"text": "デ", "consonant": "d", "consonant_length": 0.058399618, "vowel": "e", "vowel_length": 0.09201351, "pitch": 5.4747167}, {"text": "ス", "consonant": "s", "consonant_length": 0.08852549, "vowel": "U", "vowel_length": 0.1281984, "pitch": 0.0}], "accent": 1, "pause_mora": null, "is_interrogative": false}], "speed_scale": 1.0, "pitch_scale": 0.0, "intonation_scale": 1.0, "volume_scale": 1.0, "pre_phoneme_length": 0.1, "post_phoneme_length": 0.1, "output_sampling_rate": 24000, "output_stereo": false, "kana": "コレワ'/テ'_ストデ_ス"} -[INFO] run.py: Wrote `audio.wav` -[DEBUG] lib.rs: Destructing a VoicevoxCore -``` - -正常に実行されれば音声合成の結果である wav ファイルが生成されます。 -この例の場合、`"これはテストです"`という読み上げの wav ファイルが audio.wav という名前で生成されます。 diff --git a/example/pyo3/run.py b/example/pyo3/run.py deleted file mode 100644 index f60f4e0fe..000000000 --- a/example/pyo3/run.py +++ /dev/null @@ -1,78 +0,0 @@ -import dataclasses -import json -import logging -from argparse import ArgumentParser -from pathlib import Path -from typing import Tuple - -import voicevox_core -from voicevox_core import AccelerationMode, AudioQuery, VoicevoxCore - -SPEAKER_ID = 0 - - -def main() -> None: - logging.basicConfig( - format="[%(levelname)s] %(filename)s: 
%(message)s", level="DEBUG" - ) - logger = logging.getLogger(__name__) - - (acceleration_mode, open_jtalk_dict_dir, text, out) = parse_args() - - logger.debug("%s", f"{voicevox_core.METAS=}") - logger.debug("%s", f"{voicevox_core.SUPPORTED_DEVICES=}") - - logger.info("%s", f"Initializing ({acceleration_mode=}, {open_jtalk_dict_dir=})") - core = VoicevoxCore( - acceleration_mode=acceleration_mode, open_jtalk_dict_dir=open_jtalk_dict_dir - ) - - logger.debug("%s", f"{core.is_gpu_mode=}") - - logger.info("%s", f"Loading model {SPEAKER_ID}") - core.load_model(SPEAKER_ID) - - logger.debug("%s", f"{core.is_model_loaded(0)=}") - - logger.info("%s", f"Creating an AudioQuery from {text!r}") - audio_query = core.audio_query(text, SPEAKER_ID) - - logger.info("%s", f"Synthesizing with {display_as_json(audio_query)}") - wav = core.synthesis(audio_query, SPEAKER_ID) - - out.write_bytes(wav) - logger.info("%s", f"Wrote `{out}`") - - -def parse_args() -> Tuple[AccelerationMode, Path, str, Path]: - argparser = ArgumentParser() - argparser.add_argument( - "--mode", - default="AUTO", - type=AccelerationMode, - help='モード ("AUTO", "CPU", "GPU")', - ) - argparser.add_argument( - "open_jtalk_dict_dir", - type=Path, - help="Open JTalkの辞書ディレクトリ", - ) - argparser.add_argument( - "text", - help="読み上げさせたい文章", - ) - argparser.add_argument( - "out", - type=Path, - help="出力wavファイルのパス", - ) - args = argparser.parse_args() - return (args.mode, args.open_jtalk_dict_dir, args.text, args.out) - - -def display_as_json(audio_query: AudioQuery) -> str: - return json.dumps(dataclasses.asdict(audio_query), ensure_ascii=False) - - -if __name__ == "__main__": - main() diff --git a/example/python/.gitignore b/example/python/.gitignore index 117b718ec..c4d7f114d 100644 --- a/example/python/.gitignore +++ b/example/python/.gitignore @@ -1,147 +1,8 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ 
-develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - # OpenJTalk-dictionary's dir open_jtalk_dic_utf_8-* # shared library +*.so *.so.* *.dylib *.dll -voicevox_core/ diff --git a/example/python/README.md b/example/python/README.md index 606c6b24f..05ff6e7e8 100644 --- a/example/python/README.md +++ b/example/python/README.md @@ -1,23 +1,49 @@ -# Python のサンプルコード +# Python サンプルコード (PyO3 によるバインディング経由) -python から voicevox_core ライブラリを使用するためのサンプルコードです。 +voicevox_core ライブラリ の Python バインディングを使った音声合成のサンプルコードです。 +`pip install`で導入することができます。 -## サンプル実行方法 +## 準備 -まず、この README があるディレクトリで、[Downloader を使用して voicevox_core をダウンロードします](../../docs/downloads/download.md#default)。 -次に下記コマンドを実行して python のサンプルを実行します。 +TODO -```bash -# サンプルコード実行のための依存モジュールのインストール -pip install -r requirements.txt -python run.py \ - --text "これは本当に実行できているんですか" \ - --speaker_id 1 +- Python インタプリタ ≧3.8 + venv +- voicevox_core_python_api の whl (`pip install`) +- onnxruntime の DLL (/README.md と同様) +- open_jtalk_dic_utf_8-1.11 (/README.md と同様) -# 引数の紹介 -# --text 読み上げるテキスト -# --speaker_id 話者ID -# --use_gpu GPUを使う -# --f0_speaker_id 音高の話者ID(デフォルト値はspeaker_id) -# --f0_correct 音高の補正値(デフォルト値は0。+-0.3くらいで結果が大きく変わります) +## 実行 + +Open JTalk 辞書ディレクトリ、読み上げさせたい文章、出力 wav ファイルのパスの 3 つを指定して run.py を実行します。 + +```console +❯ python ./run.py -h +usage: run.py [-h] [--mode MODE] open_jtalk_dict_dir text out + +positional arguments: + open_jtalk_dict_dir Open JTalkの辞書ディレクトリ + text 読み上げさせたい文章 + out 出力wavファイルのパス + +optional arguments: + -h, --help show this help message and exit + --mode 
MODE モード ("AUTO", "CPU", "GPU") ``` + +```console +❯ # python ./run.py <Open JTalk辞書ディレクトリ> <読み上げさせたい文章> <出力wavファイルのパス> +❯ python ./run.py ./open_jtalk_dic_utf_8-1.11 これはテストです ./audio.wav +[DEBUG] run.py: voicevox_core.METAS=[Meta(name='四国めたん', styles=[Style(name='あまあま', id=0)], speaker_uuid='7ffcb7ce-00ec-4bdc-82cd-45a8889e43ff', version='0.0.1'), Meta(name='ずんだもん', styles=[Style(name='あまあま', id=1)], speaker_uuid='388f246b-8c41-4ac1-8e2d-5d79f3ff56d9', version='0.0.1')] +[DEBUG] run.py: voicevox_core.SUPPORTED_DEVICES=SupportedDevices(cpu=True, cuda=True, dml=False) +[INFO] run.py: Initializing (acceleration_mode=<AccelerationMode.AUTO: 'AUTO'>, open_jtalk_dict_dir=PosixPath('open_jtalk_dic_utf_8-1.11')) +[DEBUG] run.py: core.is_gpu_mode=True +[INFO] run.py: Loading model 0 +[DEBUG] run.py: core.is_model_loaded(0)=True +[INFO] run.py: Creating an AudioQuery from 'これはテストです' +[INFO] run.py: Synthesizing with {"accent_phrases": [{"moras": [{"text": "コ", "consonant": "k", "consonant_length": 0.063058704, "vowel": "o", "vowel_length": 0.08937682, "pitch": 5.5699596}, {"text": "レ", "consonant": "r", "consonant_length": 0.047547057, "vowel": "e", "vowel_length": 0.07596417, "pitch": 5.6643105}, {"text": "ワ", "consonant": "w", "consonant_length": 0.053706698, "vowel": "a", "vowel_length": 0.10348523, "pitch": 5.7773285}], "accent": 3, "pause_mora": null, "is_interrogative": false}, {"moras": [{"text": "テ", "consonant": "t", "consonant_length": 0.06311223, "vowel": "e", "vowel_length": 0.07596652, "pitch": 5.881741}, {"text": "ス", "consonant": "s", "consonant_length": 0.038565055, "vowel": "U", "vowel_length": 0.050694168, "pitch": 0.0}, {"text": "ト", "consonant": "t", "consonant_length": 0.06685759, "vowel": "o", "vowel_length": 0.0753997, "pitch": 5.737323}, {"text": "デ", "consonant": "d", "consonant_length": 0.058399618, "vowel": "e", "vowel_length": 0.09201351, "pitch": 5.4747167}, {"text": "ス", "consonant": "s", "consonant_length": 0.08852549, "vowel": "U", "vowel_length": 0.1281984, "pitch": 0.0}], "accent": 1,
"pause_mora": null, "is_interrogative": false}], "speed_scale": 1.0, "pitch_scale": 0.0, "intonation_scale": 1.0, "volume_scale": 1.0, "pre_phoneme_length": 0.1, "post_phoneme_length": 0.1, "output_sampling_rate": 24000, "output_stereo": false, "kana": "コレワ'/テ'_ストデ_ス"} +[INFO] run.py: Wrote `audio.wav` +[DEBUG] lib.rs: Destructing a VoicevoxCore +``` + +正常に実行されれば音声合成の結果である wav ファイルが生成されます。 +この例の場合、`"これはテストです"`という読み上げの wav ファイルが audio.wav という名前で生成されます。 diff --git a/example/python/core.py b/example/python/core.py deleted file mode 100644 index 4df8bb6cf..000000000 --- a/example/python/core.py +++ /dev/null @@ -1,227 +0,0 @@ -from ctypes import * -import platform -import os -from pathlib import Path -import json -from typing import List, Optional, TypedDict, Union -import numpy - -# numpy ndarray types -int64_dim1_type = numpy.ctypeslib.ndpointer(dtype=numpy.int64, ndim=1) -float32_dim1_type = numpy.ctypeslib.ndpointer(dtype=numpy.float32, ndim=1) -int64_dim2_type = numpy.ctypeslib.ndpointer(dtype=numpy.int64, ndim=2) -float32_dim2_type = numpy.ctypeslib.ndpointer(dtype=numpy.float32, ndim=2) - -get_os = platform.system() - -lib_file = "" -if get_os == "Windows": - lib_file = "core.dll" -elif get_os == "Darwin": - lib_file = "libcore.dylib" -elif get_os == "Linux": - lib_file = "libcore.so" - -# ライブラリ読み込み -core_dll_path = Path(os.path.dirname(__file__) + f"/voicevox_core/{lib_file}") -if not os.path.exists(core_dll_path): - raise Exception(f"coreライブラリファイルが{core_dll_path}に存在しません") -lib = cdll.LoadLibrary(str(core_dll_path)) - -# 関数型定義 -lib.initialize.argtypes = (c_bool, c_int, c_bool) -lib.initialize.restype = c_bool - -lib.load_model.argtypes = (c_int64,) -lib.load_model.restype = c_bool - -lib.is_model_loaded.argtypes = (c_int64,) -lib.is_model_loaded.restype = c_bool - -lib.finalize.argtypes = () - -lib.metas.restype = c_char_p - -lib.supported_devices.restype = c_char_p - -lib.yukarin_s_forward.argtypes = ( - c_int64, int64_dim1_type, int64_dim1_type, 
float32_dim1_type) -lib.yukarin_s_forward.restype = c_bool - -lib.yukarin_sa_forward.argtypes = (c_int64, int64_dim2_type, int64_dim2_type, int64_dim2_type, - int64_dim2_type, int64_dim2_type, int64_dim2_type, int64_dim1_type, float32_dim2_type) -lib.yukarin_sa_forward.restype = c_bool - -lib.decode_forward.argtypes = ( - c_int64, c_int64, float32_dim2_type, float32_dim2_type, int64_dim1_type, float32_dim1_type) -lib.decode_forward.restype = c_bool - -lib.last_error_message.restype = c_char_p - -lib.voicevox_load_openjtalk_dict.argtypes = (c_char_p,) -lib.voicevox_load_openjtalk_dict.restype = c_int - -lib.voicevox_audio_query.argtypes = (c_char_p, c_int64, POINTER(c_char_p)) -lib.voicevox_audio_query.restype = c_int - -lib.voicevox_audio_query_from_kana.argtypes = (c_char_p, c_int64, POINTER(c_char_p)) -lib.voicevox_audio_query_from_kana.restype = c_int - -lib.voicevox_synthesis.argtypes = (c_char_p, c_int64, POINTER(c_int), POINTER(POINTER(c_uint8))) -lib.voicevox_synthesis.restype = c_int - -lib.voicevox_tts.argtypes = (c_char_p, c_int64, POINTER(c_int), POINTER(POINTER(c_uint8))) -lib.voicevox_tts.restype = c_int - -lib.voicevox_tts_from_kana.argtypes = (c_char_p, c_int64, POINTER(c_int), POINTER(POINTER(c_uint8))) -lib.voicevox_tts_from_kana.restype = c_int - -lib.voicevox_audio_query_json_free.argtypes = (c_char_p,) - -lib.voicevox_wav_free.argtypes = (POINTER(c_uint8),) - -lib.voicevox_error_result_to_message.argtypes = (c_int,) -lib.voicevox_load_openjtalk_dict.argtypes = (c_char_p,) - -# ラッパー関数 -def initialize(use_gpu: bool, cpu_num_threads=0, load_all_models=True): - success = lib.initialize(use_gpu, cpu_num_threads, load_all_models) - if not success: - raise Exception(lib.last_error_message().decode()) - -def load_model(speaker_id: int): - success = lib.load_model(speaker_id) - if not success: - raise Exception(lib.last_error_message().decode()) - -def is_model_loaded(speaker_id: int) -> bool: - return lib.is_model_loaded(speaker_id) - -def metas() -> 
str: - return lib.metas().decode() - - -def supported_devices() -> str: - return lib.supported_devices().decode() - - -def yukarin_s_forward(length: int, phoneme_list: numpy.ndarray, speaker_id: numpy.ndarray) -> numpy.ndarray: - output = numpy.zeros((length, ), dtype=numpy.float32) - success = lib.yukarin_s_forward(length, phoneme_list, speaker_id, output) - if not success: - raise Exception(lib.last_error_message().decode()) - return output - - -def yukarin_sa_forward( - length: int, - vowel_phoneme_list, - consonant_phoneme_list, - start_accent_list, - end_accent_list, - start_accent_phrase_list, - end_accent_phrase_list, - speaker_id -): - output = numpy.empty((len(speaker_id), length,), dtype=numpy.float32) - success = lib.yukarin_sa_forward( - length, vowel_phoneme_list, consonant_phoneme_list, start_accent_list, end_accent_list, start_accent_phrase_list, end_accent_phrase_list, speaker_id, output - ) - if not success: - raise Exception(lib.last_error_message().decode()) - return output - - -def decode_forward(length: int, phoneme_size: int, f0, phoneme, speaker_id): - output = numpy.empty((length*256,), dtype=numpy.float32) - success = lib.decode_forward( - length, phoneme_size, f0, phoneme, speaker_id, output - ) - if not success: - raise Exception(lib.last_error_message().decode()) - return output - -def voicevox_load_openjtalk_dict(dict_path: str): - errno = lib.voicevox_load_openjtalk_dict(dict_path.encode()) - if errno != 0: - raise Exception(lib.voicevox_error_result_to_message(errno).decode()) - -def voicevox_audio_query(text: str, speaker_id: int) -> "AudioQuery": - output_json = c_char_p() - errno = lib.voicevox_audio_query(text.encode(), speaker_id, byref(output_json)) - if errno != 0: - raise Exception(lib.voicevox_error_result_to_message(errno).decode()) - audio_query = json.loads(output_json.value) - lib.voicevox_audio_query_json_free(output_json) - return audio_query - -def voicevox_audio_query_from_kana(text: str, speaker_id: int) -> 
"AudioQuery": - output_json = c_char_p() - errno = lib.voicevox_audio_query_from_kana(text.encode(), speaker_id, byref(output_json)) - if errno != 0: - raise Exception(lib.voicevox_error_result_to_message(errno).decode()) - audio_query = json.loads(output_json.value) - lib.voicevox_audio_query_json_free(output_json) - return audio_query - -def voicevox_synthesis(audio_query: "AudioQuery", speaker_id: int) -> bytes: - output_binary_size = c_int() - output_wav = POINTER(c_uint8)() - errno = lib.voicevox_synthesis(json.dumps(audio_query).encode(), speaker_id, byref(output_binary_size), byref(output_wav)) - if errno != 0: - raise Exception(lib.voicevox_error_result_to_message(errno).decode()) - output = create_string_buffer(output_binary_size.value * sizeof(c_uint8)) - memmove(output, output_wav, output_binary_size.value * sizeof(c_uint8)) - lib.voicevox_wav_free(output_wav) - return output - -def voicevox_tts(text: str, speaker_id: int) -> bytes: - output_binary_size = c_int() - output_wav = POINTER(c_uint8)() - errno = lib.voicevox_tts(text.encode(), speaker_id, byref(output_binary_size), byref(output_wav)) - if errno != 0: - raise Exception(lib.voicevox_error_result_to_message(errno).decode()) - output = create_string_buffer(output_binary_size.value * sizeof(c_uint8)) - memmove(output, output_wav, output_binary_size.value * sizeof(c_uint8)) - lib.voicevox_wav_free(output_wav) - return output - -def voicevox_tts_from_kana(text: str, speaker_id: int) -> bytes: - output_binary_size = c_int() - output_wav = POINTER(c_uint8)() - errno = lib.voicevox_tts_from_kana(text.encode(), speaker_id, byref(output_binary_size), byref(output_wav)) - if errno != 0: - raise Exception(lib.voicevox_error_result_to_message(errno).decode()) - output = create_string_buffer(output_binary_size.value * sizeof(c_uint8)) - memmove(output, output_wav, output_binary_size.value * sizeof(c_uint8)) - lib.voicevox_wav_free(output_wav) - return output - -def finalize(): - lib.finalize() - -class 
AudioQuery(TypedDict): - accent_phrases: List["AccentPhrase"] - speedScale: float - pitchScale: float - intonationScale: float - volumeScale: float - prePhonemeLength: float - postPhonemeLength: float - outputSamplingRate: int - outputStereo: bool - kana: Optional[str] - -class AccentPhrase(TypedDict): - moras: List["Mora"] - accent: int - pause_mora: Optional["Mora"] - is_interrogative: bool - -class Mora(TypedDict): - text: str - consonant: Optional[str] - consonant_length: Optional[float] - vowel: str - vowel_length: float - pitch: float diff --git a/example/python/requirements.txt b/example/python/requirements.txt deleted file mode 100644 index 24ce15ab7..000000000 --- a/example/python/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -numpy diff --git a/example/python/run.py b/example/python/run.py index 638b1f029..f60f4e0fe 100644 --- a/example/python/run.py +++ b/example/python/run.py @@ -1,53 +1,78 @@ -import argparse +import dataclasses +import json +import logging +from argparse import ArgumentParser +from pathlib import Path +from typing import Tuple -import core +import voicevox_core +from voicevox_core import AccelerationMode, AudioQuery, VoicevoxCore +SPEAKER_ID = 0 -def run( - use_gpu: bool, - text: str, - speaker_id: int, - cpu_num_threads: int, - openjtalk_dict: str, - output: str, -) -> None: - # コアの初期化 - core.initialize(use_gpu, cpu_num_threads, load_all_models=False) - # openjtalk辞書のロード - core.voicevox_load_openjtalk_dict(openjtalk_dict) +def main() -> None: + logging.basicConfig( + format="[%(levelname)s] %(filename)s: %(message)s", level="DEBUG" + ) + logger = logging.getLogger(__name__) + + (acceleration_mode, open_jtalk_dict_dir, text, out) = parse_args() - # 話者のロード - core.load_model(speaker_id) + logger.debug("%s", f"{voicevox_core.METAS=}") + logger.debug("%s", f"{voicevox_core.SUPPORTED_DEVICES=}") - # AudioQueryの生成 - audio_query = core.voicevox_audio_query(text, speaker_id) + logger.info("%s", f"Initializing ({acceleration_mode=}, 
{open_jtalk_dict_dir=})") + core = VoicevoxCore( + acceleration_mode=acceleration_mode, open_jtalk_dict_dir=open_jtalk_dict_dir + ) - # 音声合成 - wavefmt = core.voicevox_synthesis(audio_query, speaker_id) + logger.debug("%s", f"{core.is_gpu_mode=}") - # 保存 - with open(output, "wb") as f: - f.write(wavefmt) + logger.info("%s", f"Loading model {SPEAKER_ID}") + core.load_model(SPEAKER_ID) - core.finalize() + logger.debug("%s", f"{core.is_model_loaded(0)=}") + logger.info("%s", f"Creating an AudioQuery from {text!r}") + audio_query = core.audio_query(text, SPEAKER_ID) -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--use_gpu", action="store_true") - parser.add_argument("--text", required=True) - parser.add_argument("--speaker_id", type=int, required=True) - parser.add_argument("--cpu_num_threads", type=int, default=0) - parser.add_argument( - "--openjtalk_dict", - type=str, - default="voicevox_core/open_jtalk_dic_utf_8-1.11" + logger.info("%s", f"Synthesizing with {display_as_json(audio_query)}") + wav = core.synthesis(audio_query, SPEAKER_ID) + + out.write_bytes(wav) + logger.info("%s", f"Wrote `{out}`") + + +def parse_args() -> Tuple[AccelerationMode, Path, str, Path]: + argparser = ArgumentParser() + argparser.add_argument( + "--mode", + default="AUTO", + type=AccelerationMode, + help='モード ("AUTO", "CPU", "GPU")', ) - parser.add_argument("--output", type=str) - - args = parser.parse_args() - if args.output is None: - args.output = f"{args.text}-{args.speaker_id}.wav" + argparser.add_argument( + "open_jtalk_dict_dir", + type=Path, + help="Open JTalkの辞書ディレクトリ", + ) + argparser.add_argument( + "text", + help="読み上げさせたい文章", + ) + argparser.add_argument( + "out", + type=Path, + help="出力wavファイルのパス", + ) + args = argparser.parse_args() + return (args.mode, args.open_jtalk_dict_dir, args.text, args.out) + - run(**vars(args)) +def display_as_json(audio_query: AudioQuery) -> str: + return json.dumps(dataclasses.asdict(audio_query), 
ensure_ascii=False) + + +if __name__ == "__main__": + main()