From 4115f97bf0cf3151662056435c6274d73149238f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=A8=E5=AD=90=E6=9D=8E?= <104406518+LKZMuZiLi@users.noreply.github.com>
Date: Sat, 28 Oct 2023 23:10:24 +0800
Subject: [PATCH] Add C# TTS API (#399)

---
 ...SherpaOnnxGeneratedAudioResultPlayAudio.cs |  71 ++++++
 dotnet-examples/TTS/Program.cs                |  82 +++++++
 .../TTS/Struct/SherpaOnnxGeneratedAudio.cs    | 222 ++++++++++++++++++
 .../TTS/Struct/SherpaOnnxOfflineTtsConfig.cs  |  13 +
 .../Struct/SherpaOnnxOfflineTtsModelConfig.cs |  23 ++
 .../SherpaOnnxOfflineTtsVitsModelConfig.cs    |  48 ++++
 dotnet-examples/TTS/TTSCore.cs                | 107 +++++++++
 7 files changed, 566 insertions(+)
 create mode 100644 dotnet-examples/TTS/PlayAudioPartial/SherpaOnnxGeneratedAudioResultPlayAudio.cs
 create mode 100644 dotnet-examples/TTS/Program.cs
 create mode 100644 dotnet-examples/TTS/Struct/SherpaOnnxGeneratedAudio.cs
 create mode 100644 dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsConfig.cs
 create mode 100644 dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsModelConfig.cs
 create mode 100644 dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsVitsModelConfig.cs
 create mode 100644 dotnet-examples/TTS/TTSCore.cs

diff --git a/dotnet-examples/TTS/PlayAudioPartial/SherpaOnnxGeneratedAudioResultPlayAudio.cs b/dotnet-examples/TTS/PlayAudioPartial/SherpaOnnxGeneratedAudioResultPlayAudio.cs
new file mode 100644
index 000000000..1eb1e3568
--- /dev/null
+++ b/dotnet-examples/TTS/PlayAudioPartial/SherpaOnnxGeneratedAudioResultPlayAudio.cs
@@ -0,0 +1,71 @@
+using NAudio.Wave;
+
+namespace TTS.Struct
+{
+    public sealed partial class SherpaOnnxGeneratedAudioResult
+    {
+        private WaveOutEvent waveOut;
+        private WaveFormat waveFormat;
+        private BufferedWaveProvider bufferedWaveProvider;
+
+        // Number of PCM bytes queued for playback; never 0 so PlayProgress cannot divide by zero.
+        private int bufferLength = 1;
+
+        /// <summary>
+        /// Duration of the audio currently buffered for playback; null until Play() has been called.
+        /// </summary>
+        public TimeSpan? AudioDuration => bufferedWaveProvider?.BufferedDuration;
+
+        /// <summary>
+        /// Playback position divided by the number of queued PCM bytes; 0 until Play() has been called.
+        /// NOTE(review): Program.cs waits for this to exceed 1, i.e. it assumes the output
+        /// device keeps advancing after the queued audio has been played - confirm against NAudio.
+        /// </summary>
+        public float PlayProgress => waveOut == null ? 0f : waveOut.GetPosition() * 1.0f / bufferLength;
+
+        /// <summary>
+        /// Starts (or resumes) playback. The first call converts the samples to 16-bit PCM and queues them.
+        /// </summary>
+        public void Play()
+        {
+            waveOut ??= new WaveOutEvent();
+
+            // AudioByteData holds 16-bit PCM, mono, at the generated sample rate.
+            waveFormat ??= new WaveFormat(sample_rate, AudioDataBit, Channels);
+
+            if (bufferedWaveProvider == null)
+            {
+                var buffer = AudioByteData;
+
+                // Keep the divisor used by PlayProgress non-zero even if no samples were generated.
+                bufferLength = Math.Max(buffer.Length, 1);
+
+                // BufferLength has to be set before the first AddSamples call: that call allocates
+                // the circular buffer (5 seconds by default) and throws "Buffer full" on overflow.
+                bufferedWaveProvider = new BufferedWaveProvider(waveFormat)
+                {
+                    BufferLength = bufferLength,
+                };
+                bufferedWaveProvider.AddSamples(buffer, 0, buffer.Length);
+                waveOut.Init(bufferedWaveProvider);
+            }
+            waveOut.Play();
+        }
+
+        /// <summary>
+        /// Stops playback if it has been started.
+        /// </summary>
+        public void Stop()
+        {
+            waveOut?.Stop();
+        }
+
+        // Implements the hook declared in SherpaOnnxGeneratedAudio.cs; runs from Dispose().
+        partial void DisposePlayback()
+        {
+            waveOut?.Dispose();
+            waveOut = null;
+            bufferedWaveProvider = null;
+        }
+    }
+}
diff --git a/dotnet-examples/TTS/Program.cs b/dotnet-examples/TTS/Program.cs
new file mode 100644
index 000000000..7081040d5
--- /dev/null
+++ b/dotnet-examples/TTS/Program.cs
@@ -0,0 +1,82 @@
+using System.Text;
+using TTS;
+using TTS.Struct;
+
+internal class Program
+{
+    private static void Main(string[] args)
+    {
+        SherpaOnnxOfflineTtsConfig sherpaOnnxOfflineTtsConfig = new SherpaOnnxOfflineTtsConfig();
+        sherpaOnnxOfflineTtsConfig.model = new SherpaOnnxOfflineTtsModelConfig
+        {
+            debug = 0,
+            num_threads = 4,
+            provider = "cpu",
+            vits = new SherpaOnnxOfflineTtsVitsModelConfig
+            {
+                lexicon = "vits-zh-aishell3/lexicon.txt",
+                model = "vits-zh-aishell3/vits-aishell3.onnx",
+                tokens = "vits-zh-aishell3/tokens.txt",
+
+                noise_scale = 0.667f,
+                noise_scale_w = 0.8f,
+                length_scale = 1,
+            },
+
+        };
+
+        // Release the native TTS engine when Main exits.
+        using TTSCore i = new TTSCore(sherpaOnnxOfflineTtsConfig);
+
+        Console.InputEncoding = Encoding.Unicode;
+        Console.OutputEncoding = Encoding.UTF8;
+
+        while (true)
+        {
+            var str = Console.ReadLine();
+
+            // ReadLine returns null at end of input; stop instead of passing null on.
+            if (str == null)
+            {
+                break;
+            }
+
+            // Nothing to speak; this also keeps str[index] below from indexing an empty string.
+            if (string.IsNullOrWhiteSpace(str))
+            {
+                continue;
+            }
+
+            // Free the native samples and the audio device once this utterance is done.
+            using var audioResult = i.ToSpeech(str, 40, 1f);
+
+            // audioResult.WriteWAVFile("123.wav"); // save to a local file
+
+            audioResult.Play();
+
+            // Echo the text roughly in step with playback. PlayProgress is read once per
+            // iteration so the loop test and the index use the same value.
+            int lastIndex = -1;
+            float progress;
+            while ((progress = audioResult.PlayProgress) <= 1f)
+            {
+                int index = (int)(progress * (str.Length - 1));
+                if (lastIndex != index)
+                {
+                    Console.Write(str[index]);
+                    lastIndex = index;
+                }
+                Thread.Sleep(100);
+            }
+
+            if (++lastIndex < str.Length)
+            {
+                Console.Write(str[lastIndex]);
+            }
+
+            Console.WriteLine();
+
+            audioResult.Stop();
+        }
+    }
+}
diff --git a/dotnet-examples/TTS/Struct/SherpaOnnxGeneratedAudio.cs b/dotnet-examples/TTS/Struct/SherpaOnnxGeneratedAudio.cs
new file mode 100644
index 000000000..affc3a034
--- /dev/null
+++ b/dotnet-examples/TTS/Struct/SherpaOnnxGeneratedAudio.cs
@@ -0,0 +1,222 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Runtime.InteropServices;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace TTS.Struct
+{
+    /// <summary>
+    /// Generated speech. Wraps the native audio object and frees it when disposed.
+    /// </summary>
+    public sealed partial class SherpaOnnxGeneratedAudioResult : IDisposable
+    {
+        public const string Filename = "sherpa-onnx-c-api";
+
+        /// <summary>
+        /// Frees the unmanaged audio object.
+        /// </summary>
+        /// <param name="ttsGenerateIntptr">Pointer to the native generated-audio object.</param>
+        [DllImport(Filename)]
+        private static extern void SherpaOnnxDestroyOfflineTtsGeneratedAudio(IntPtr ttsGenerateIntptr);
+
+        [DllImport(Filename)]
+        private static extern int SherpaOnnxWriteWave(IntPtr q, int n, int sample_rate, string filename);
+
+        /// <summary>
+        /// Bits per sample of the PCM data returned by AudioByteData.
+        /// </summary>
+        public const int AudioDataBit = 16;
+        /// <summary>
+        /// Number of channels (mono).
+        /// </summary>
+        public const int Channels = 1;
+
+        /// <summary>
+        /// Native handle; IntPtr.Zero once disposed.
+        /// </summary>
+        internal IntPtr thisHandle;
+
+        internal readonly IntPtr audioData;
+        internal readonly int dataSize;
+
+        /// <summary>
+        /// Sample rate.
+        /// </summary>
+        public readonly int sample_rate;
+
+        /// <summary>
+        /// Pointer to the native float samples; must not be used after Dispose.
+        /// </summary>
+        public IntPtr AudioDataIntPtr => audioData;
+
+        /// <summary>
+        /// Number of float samples.
+        /// </summary>
+        public int AudioDataLength => dataSize;
+
+        /// <summary>
+        /// Samples as float[]. Allocates a new array on every call.
+        /// </summary>
+        public float[] AudioFloatData
+        {
+            get
+            {
+                float[] floatAudioData = new float[AudioDataLength];
+                ReadData(floatAudioData, 0);
+                return floatAudioData;
+            }
+        }
+
+        /// <summary>
+        /// Samples as 16-bit little-endian PCM bytes. Allocates a new array on every call.
+        /// </summary>
+        public byte[] AudioByteData
+        {
+            get
+            {
+                byte[] bytes = new byte[AudioDataLength * sizeof(short)];
+                ReadData(bytes, 0);
+                return bytes;
+            }
+        }
+
+        internal SherpaOnnxGeneratedAudioResult(IntPtr intPtr, SherpaOnnxGeneratedAudio sherpaOnnx)
+        {
+            this.thisHandle = intPtr;
+            this.audioData = sherpaOnnx.audioData;
+            this.dataSize = sherpaOnnx.dataSize;
+            this.sample_rate = sherpaOnnx.sample_rate;
+        }
+
+        ~SherpaOnnxGeneratedAudioResult()
+        {
+            Dispose(false);
+        }
+
+        /// <summary>
+        /// Copies the samples into a caller-supplied array, so nothing is allocated.
+        /// </summary>
+        /// <param name="audioFloats">Destination array.</param>
+        /// <param name="offset">Index in the destination at which writing starts.</param>
+        /// <returns>Number of samples written.</returns>
+        public int ReadData(float[] audioFloats, int offset)
+        {
+            int samples = CountWritableSamples(audioFloats, offset, 1);
+            if (samples > 0)
+            {
+                Marshal.Copy(audioData, audioFloats, offset, samples);
+            }
+
+            // Keep the finalizer from freeing the native samples while they are being copied.
+            GC.KeepAlive(this);
+            return samples;
+        }
+
+        /// <summary>
+        /// Converts the samples to 16-bit little-endian PCM in a caller-supplied array,
+        /// so nothing is allocated.
+        /// </summary>
+        /// <param name="audioFloats">Destination array; AudioDataLength * 2 bytes hold every sample.</param>
+        /// <param name="offset">Index in the destination at which writing starts.</param>
+        /// <returns>Number of bytes written.</returns>
+        public int ReadData(byte[] audioFloats, int offset)
+        {
+            // Each float sample becomes two bytes.
+            int samples = CountWritableSamples(audioFloats, offset, sizeof(short));
+
+            for (int i = 0; i < samples; i++)
+            {
+                // Read straight from native memory instead of going through a temporary float[].
+                float sample = BitConverter.Int32BitsToSingle(Marshal.ReadInt32(audioData, i * sizeof(float)));
+
+                // Clamp first: a value outside [-1, 1] would wrap around when cast to short.
+                short value = (short)(Math.Clamp(sample, -1f, 1f) * short.MaxValue);
+
+                audioFloats[offset++] = (byte)value;
+                audioFloats[offset++] = (byte)(value >> 8);
+            }
+
+            GC.KeepAlive(this);
+            return samples * sizeof(short);
+        }
+
+        /// <summary>
+        /// Writes the samples to a WAV file.
+        /// </summary>
+        /// <param name="filename">Path of the file to write.</param>
+        /// <returns>true when the native call returns 1.</returns>
+        public bool WriteWAVFile(string filename)
+        {
+            ThrowIfDisposed();
+            bool written = 1 == SherpaOnnxWriteWave(audioData, this.dataSize, this.sample_rate, filename);
+            GC.KeepAlive(this);
+            return written;
+        }
+
+        public void Dispose()
+        {
+            Dispose(true);
+            GC.SuppressFinalize(this);
+        }
+
+        private void Dispose(bool disposing)
+        {
+            if (disposing)
+            {
+                // Managed playback objects are released only on an explicit Dispose,
+                // never from the finalizer thread.
+                DisposePlayback();
+            }
+
+            if (this.thisHandle != IntPtr.Zero)
+            {
+                SherpaOnnxDestroyOfflineTtsGeneratedAudio(this.thisHandle);
+                this.thisHandle = IntPtr.Zero;
+            }
+        }
+
+        // Hook for the optional playback half of this partial class. When that file is
+        // not compiled in, the compiler removes the call.
+        partial void DisposePlayback();
+
+        private void ThrowIfDisposed()
+        {
+            if (this.thisHandle == IntPtr.Zero)
+            {
+                throw new ObjectDisposedException(nameof(SherpaOnnxGeneratedAudioResult));
+            }
+        }
+
+        // Validates the arguments shared by both ReadData overloads and returns how many
+        // samples fit into destination after offset when each sample takes slotsPerSample elements.
+        private int CountWritableSamples(Array destination, int offset, int slotsPerSample)
+        {
+            if (destination == null)
+            {
+                throw new ArgumentNullException("audioFloats");
+            }
+            if (offset < 0)
+            {
+                throw new ArgumentOutOfRangeException(nameof(offset));
+            }
+            ThrowIfDisposed();
+
+            int room = (destination.Length - offset) / slotsPerSample;
+            return Math.Max(0, Math.Min(AudioDataLength, room));
+        }
+    }
+
+    [StructLayout(LayoutKind.Sequential)]
+    internal struct SherpaOnnxGeneratedAudio
+    {
+        internal readonly IntPtr audioData;
+        internal readonly int dataSize;
+
+        /// <summary>
+        /// Sample rate.
+        /// </summary>
+        public readonly int sample_rate;
+    }
+}
diff --git a/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsConfig.cs b/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsConfig.cs
new file mode 100644
index 000000000..1d20264eb
--- /dev/null
+++ b/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsConfig.cs
@@ -0,0 +1,13 @@
+using System.Runtime.InteropServices;
+
+namespace TTS.Struct
+{
+    /// <summary>
+    /// Top-level configuration handed to the native offline TTS constructor.
+    /// </summary>
+    [StructLayout(LayoutKind.Sequential)]
+    public struct SherpaOnnxOfflineTtsConfig
+    {
+        public SherpaOnnxOfflineTtsModelConfig model;
+    }
+}
diff --git a/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsModelConfig.cs b/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsModelConfig.cs
new file mode 100644
index 000000000..46dd55859
--- /dev/null
+++ b/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsModelConfig.cs
@@ -0,0 +1,23 @@
+using System.Runtime.InteropServices;
+
+namespace TTS.Struct
+{
+    [StructLayout(LayoutKind.Sequential)]
+    public struct SherpaOnnxOfflineTtsModelConfig
+    {
+        /// <summary>
+        /// VITS model configuration.
+        /// </summary>
+        public SherpaOnnxOfflineTtsVitsModelConfig vits;
+        /// <summary>
+        /// Number of threads.
+        /// </summary>
+        public int num_threads;
+        public int debug;
+        /// <summary>
+        /// Execution provider, e.g. "cpu".
+        /// </summary>
+        [MarshalAs(UnmanagedType.LPStr)]
+        public string provider;
+    }
+}
diff --git a/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsVitsModelConfig.cs b/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsVitsModelConfig.cs
new file mode 100644
index 000000000..4b37d81b0
--- /dev/null
+++ b/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsVitsModelConfig.cs
@@ -0,0 +1,48 @@
+using System.Runtime.InteropServices;
+
+namespace TTS.Struct
+{
+    [StructLayout(LayoutKind.Sequential)]
+    public struct SherpaOnnxOfflineTtsVitsModelConfig
+    {
+        /// <summary>
+        /// Path to the model, e.g.
+        /// "vits-zh-aishell3/vits-aishell3.onnx"
+        /// </summary>
+        [MarshalAs(UnmanagedType.LPStr)]
+        public string model;
+        /// <summary>
+        /// Path to the lexicon file, e.g.
+        /// "vits-zh-aishell3/lexicon.txt"
+        /// </summary>
+        [MarshalAs(UnmanagedType.LPStr)]
+        public string lexicon;
+
+        [MarshalAs(UnmanagedType.LPStr)]
+        public string tokens;
+
+        /// <summary>
+        /// noise_scale of the VITS model (float, default = 0.667)
+        /// </summary>
+        public float noise_scale = 0.667f;
+        /// <summary>
+        /// noise_scale_w of the VITS model (float, default = 0.8)
+        /// </summary>
+        public float noise_scale_w = 0.8f;
+        /// <summary>
+        /// Speech speed. Larger -> slower; smaller -> faster. (float, default = 1)
+        /// </summary>
+        public float length_scale = 1f;
+
+        public SherpaOnnxOfflineTtsVitsModelConfig()
+        {
+            noise_scale = 0.667f;
+            noise_scale_w = 0.8f;
+            length_scale = 1f;
+
+            model = "vits-zh-aishell3/vits-aishell3.onnx";
+            lexicon = "vits-zh-aishell3/lexicon.txt";
+            tokens = "vits-zh-aishell3/tokens.txt";
+        }
+    }
+}
diff --git a/dotnet-examples/TTS/TTSCore.cs b/dotnet-examples/TTS/TTSCore.cs
new file mode 100644
index 000000000..63735a8cc
--- /dev/null
+++ b/dotnet-examples/TTS/TTSCore.cs
@@ -0,0 +1,107 @@
+using System.Runtime.InteropServices;
+using TTS.Struct;
+
+namespace TTS
+{
+    /// <summary>
+    /// Managed wrapper around a native sherpa-onnx offline TTS engine.
+    /// </summary>
+    internal sealed class TTSCore : IDisposable
+    {
+        public const string Filename = "sherpa-onnx-c-api";
+
+        // The config is passed by reference because the native function takes a pointer to it;
+        // passing the struct by value only works on ABIs that pass large structs via a hidden pointer.
+        [DllImport(Filename)]
+        private static extern IntPtr SherpaOnnxCreateOfflineTts(ref SherpaOnnxOfflineTtsConfig handle);
+
+        [DllImport(Filename)]
+        private static extern IntPtr SherpaOnnxOfflineTtsGenerate(IntPtr createOfflineTtsIntptr, IntPtr text, int sid, float speed);
+
+        [DllImport(Filename)]
+        private static extern void SherpaOnnxDestroyOfflineTts(IntPtr intPtr);
+
+        /// <summary>
+        /// Native handle; IntPtr.Zero once disposed.
+        /// </summary>
+        private IntPtr thisHandle;
+
+        public TTSCore(SherpaOnnxOfflineTtsConfig modelConfig)
+        {
+            thisHandle = SherpaOnnxCreateOfflineTts(ref modelConfig);
+            if (thisHandle == IntPtr.Zero)
+            {
+                throw new InvalidOperationException("SherpaOnnxCreateOfflineTts returned a null handle.");
+            }
+        }
+
+        /// <summary>
+        /// Converts text to speech.
+        /// </summary>
+        /// <param name="text">Text to speak.</param>
+        /// <param name="sid">Speaker id (voice).</param>
+        /// <param name="speed">Speech speed.</param>
+        /// <returns>The generated audio; the caller should dispose it.</returns>
+        public SherpaOnnxGeneratedAudioResult ToSpeech(string text, int sid, float speed = 1f)
+        {
+            if (text == null)
+            {
+                throw new ArgumentNullException(nameof(text));
+            }
+            if (thisHandle == IntPtr.Zero)
+            {
+                throw new ObjectDisposedException(nameof(TTSCore));
+            }
+
+            IntPtr utf8Text = Marshal.StringToCoTaskMemUTF8(text);
+            IntPtr result;
+            try
+            {
+                result = SherpaOnnxOfflineTtsGenerate(thisHandle, utf8Text, sid, speed);
+            }
+            finally
+            {
+                // The UTF-8 copy is owned by this method; without this every call leaked it.
+                Marshal.FreeCoTaskMem(utf8Text);
+
+                // Keep the finalizer from destroying the engine while the native call runs.
+                GC.KeepAlive(this);
+            }
+
+            if (result == IntPtr.Zero)
+            {
+                throw new InvalidOperationException("SherpaOnnxOfflineTtsGenerate returned a null pointer.");
+            }
+
+            SherpaOnnxGeneratedAudio impl = (SherpaOnnxGeneratedAudio)Marshal.PtrToStructure(result, typeof(SherpaOnnxGeneratedAudio));
+            return new SherpaOnnxGeneratedAudioResult(result, impl);
+        }
+
+        /// <summary>
+        /// Converts text to speech on a thread-pool thread.
+        /// </summary>
+        /// <param name="text">Text to speak.</param>
+        /// <param name="sid">Speaker id (voice).</param>
+        /// <param name="speed">Speech speed.</param>
+        /// <returns>A task that yields the generated audio.</returns>
+        public Task<SherpaOnnxGeneratedAudioResult> ToSpeechAsync(string text, int sid, float speed = 1f)
+        {
+            return Task.Run(() => ToSpeech(text, sid, speed));
+        }
+
+        ~TTSCore()
+        {
+            Dispose();
+        }
+
+        public void Dispose()
+        {
+            if (this.thisHandle != IntPtr.Zero)
+            {
+                SherpaOnnxDestroyOfflineTts(this.thisHandle);
+                this.thisHandle = IntPtr.Zero;
+            }
+            GC.SuppressFinalize(this);
+        }
+    }
+}