From 4115f97bf0cf3151662056435c6274d73149238f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=A8=E5=AD=90=E6=9D=8E?=
<104406518+LKZMuZiLi@users.noreply.github.com>
Date: Sat, 28 Oct 2023 23:10:24 +0800
Subject: [PATCH] Add C# TTS API (#399)
---
...SherpaOnnxGeneratedAudioResultPlayAudio.cs | 44 ++++
dotnet-examples/TTS/Program.cs | 62 ++++++
.../TTS/Struct/SherpaOnnxGeneratedAudio.cs | 198 ++++++++++++++++++
.../TTS/Struct/SherpaOnnxOfflineTtsConfig.cs | 10 +
.../Struct/SherpaOnnxOfflineTtsModelConfig.cs | 23 ++
.../SherpaOnnxOfflineTtsVitsModelConfig.cs | 48 +++++
dotnet-examples/TTS/TTSCore.cs | 70 +++++++
7 files changed, 455 insertions(+)
create mode 100644 dotnet-examples/TTS/PlayAudioPartial/SherpaOnnxGeneratedAudioResultPlayAudio.cs
create mode 100644 dotnet-examples/TTS/Program.cs
create mode 100644 dotnet-examples/TTS/Struct/SherpaOnnxGeneratedAudio.cs
create mode 100644 dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsConfig.cs
create mode 100644 dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsModelConfig.cs
create mode 100644 dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsVitsModelConfig.cs
create mode 100644 dotnet-examples/TTS/TTSCore.cs
diff --git a/dotnet-examples/TTS/PlayAudioPartial/SherpaOnnxGeneratedAudioResultPlayAudio.cs b/dotnet-examples/TTS/PlayAudioPartial/SherpaOnnxGeneratedAudioResultPlayAudio.cs
new file mode 100644
index 000000000..1eb1e3568
--- /dev/null
+++ b/dotnet-examples/TTS/PlayAudioPartial/SherpaOnnxGeneratedAudioResultPlayAudio.cs
@@ -0,0 +1,44 @@
+using NAudio.Wave;
+
+namespace TTS.Struct
+{
+    public sealed partial class SherpaOnnxGeneratedAudioResult // playback half of the partial class (uses NAudio)
+    {
+        private WaveOutEvent waveOut;                      // created on the first Play()
+        private WaveFormat waveFormat;                     // created on the first Play()
+        private BufferedWaveProvider bufferedWaveProvider; // created and filled on the first Play()
+        // Number of PCM bytes queued for playback; 1 until Play() runs so PlayProgress has a non-zero divisor.
+        private int bufferLength = 1;
+        /// <summary>BufferedDuration of the internal wave provider, or null before the first Play().</summary>
+        public TimeSpan? AudioDuration => bufferedWaveProvider?.BufferedDuration;
+        /// <summary>Output position divided by the queued byte count; 0 when Play() has not been called yet.</summary>
+        public float PlayProgress => (waveOut?.GetPosition() ?? 0L) * 1.0f / bufferLength;
+        /// <summary>Starts playback; the first call converts the audio to PCM bytes and queues them.</summary>
+        public void Play()
+        {
+            waveOut ??= new WaveOutEvent();
+            // AudioDataBit bits per sample and Channels channel(s), i.e. 16-bit mono PCM (not 32-bit float).
+            waveFormat ??= new WaveFormat(sample_rate, AudioDataBit, Channels);
+
+            if (bufferedWaveProvider == null)
+            {
+                bufferedWaveProvider = new BufferedWaveProvider(waveFormat);
+
+                var buffer = AudioByteData;
+                bufferLength = buffer.Length;
+                // BufferLength has to be set before the first AddSamples: per NAudio, that call allocates the
+                // internal buffer, so sizing it afterwards kept the default capacity and long audio overflowed.
+                bufferedWaveProvider.BufferLength = bufferLength;
+                bufferedWaveProvider.AddSamples(buffer, 0, bufferLength);
+                waveOut.Init(bufferedWaveProvider);
+            }
+            waveOut.Play();
+        }
+        /// <summary>Stops playback if an output device has been created; otherwise does nothing.</summary>
+        public void Stop()
+        {
+            waveOut?.Stop();
+        }
+
+    }
+}
diff --git a/dotnet-examples/TTS/Program.cs b/dotnet-examples/TTS/Program.cs
new file mode 100644
index 000000000..7081040d5
--- /dev/null
+++ b/dotnet-examples/TTS/Program.cs
@@ -0,0 +1,62 @@
+using System.Text;
+using TTS;
+using TTS.Struct;
+
+internal class Program
+{
+    // Demo: reads lines from the console, synthesizes each one, plays it and echoes the text in step with playback.
+    private static void Main(string[] args)
+    {
+        SherpaOnnxOfflineTtsConfig sherpaOnnxOfflineTtsConfig = new SherpaOnnxOfflineTtsConfig();
+        sherpaOnnxOfflineTtsConfig.model = new SherpaOnnxOfflineTtsModelConfig
+        {
+            debug = 0,
+            num_threads = 4,
+            provider = "cpu",
+            vits = new SherpaOnnxOfflineTtsVitsModelConfig
+            {
+                lexicon = "vits-zh-aishell3/lexicon.txt",
+                model = "vits-zh-aishell3/vits-aishell3.onnx",
+                tokens = "vits-zh-aishell3/tokens.txt",
+
+                noise_scale = 0.667f,
+                noise_scale_w = 0.8f,
+                length_scale = 1,
+            },
+        };
+
+        using TTSCore i = new TTSCore(sherpaOnnxOfflineTtsConfig); // native engine is released when Main returns
+
+        Console.InputEncoding = Encoding.Unicode;
+        Console.OutputEncoding = Encoding.UTF8;
+
+        while (Console.ReadLine() is string str) // ReadLine returns null at end of input, which ends the demo
+        {
+            if (str.Length == 0)
+            {
+                continue; // nothing to synthesize; also keeps the indexing below away from an empty string
+            }
+
+            // Disposed at the end of each iteration so the native audio result is released promptly.
+            using var audioResult = i.ToSpeech(str, 40, 1f);
+            // audioResult.WriteWAVFile("123.wav"); // save a local copy
+            audioResult.Play();
+
+            int printed = 0; // number of characters already echoed
+            float progress;
+            // Read PlayProgress once per pass so the loop test and the index are computed from the same value.
+            while ((progress = audioResult.PlayProgress) <= 1f)
+            {
+                int index = (int)(progress * (str.Length - 1));
+                for (; printed <= index; printed++)
+                {
+                    Console.Write(str[printed]); // catch up on every character, not only the sampled one
+                }
+                Thread.Sleep(100);
+            }
+
+            Console.WriteLine(str.Substring(printed)); // flush whatever has not been echoed yet
+            audioResult.Stop();
+        }
+    }
+}
diff --git a/dotnet-examples/TTS/Struct/SherpaOnnxGeneratedAudio.cs b/dotnet-examples/TTS/Struct/SherpaOnnxGeneratedAudio.cs
new file mode 100644
index 000000000..affc3a034
--- /dev/null
+++ b/dotnet-examples/TTS/Struct/SherpaOnnxGeneratedAudio.cs
@@ -0,0 +1,198 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Runtime.InteropServices;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace TTS.Struct
+{
+    /// <summary>
+    /// Result of one speech generation: a view over the native audio buffer plus the handle used to free it.
+    /// </summary>
+    public sealed partial class SherpaOnnxGeneratedAudioResult : IDisposable
+    {
+        public const string Filename = "sherpa-onnx-c-api";
+
+        /// <summary>
+        /// Destroys the unmanaged generated-audio object.
+        /// </summary>
+        /// <param name="ttsGenerateIntptr">Native handle of the generated audio.</param>
+        [DllImport(Filename)]
+        private static extern void SherpaOnnxDestroyOfflineTtsGeneratedAudio(IntPtr ttsGenerateIntptr);
+
+        [DllImport(Filename)] // called by WriteWAVFile, which treats a return value of 1 as success
+        private static extern int SherpaOnnxWriteWave(IntPtr q, int n, int sample_rate, string filename);
+
+        /// <summary>
+        /// Bits per sample of the PCM bytes produced by this class.
+        /// </summary>
+        public const int AudioDataBit = 16;
+        /// <summary>
+        /// Channel count (mono).
+        /// </summary>
+        public const int Channels = 1;
+
+        /// <summary>
+        /// Native handle; set to IntPtr.Zero by Dispose().
+        /// </summary>
+        internal IntPtr thisHandle;
+
+        internal readonly IntPtr audioData; // unmanaged buffer that is read as float samples
+        internal readonly int dataSize;     // number of float samples in audioData
+
+        /// <summary>
+        /// Sample rate.
+        /// </summary>
+        public readonly int sample_rate;
+
+        /// <summary>
+        /// Pointer to the unmanaged audio data.
+        /// </summary>
+        public IntPtr AudioDataIntPtr => audioData;
+
+        /// <summary>
+        /// Number of float samples in the audio data.
+        /// </summary>
+        public int AudioDataLength
+        {
+            get
+            {
+                // The count is taken from the native result struct. An earlier draft scanned the
+                // buffer for a 0 value instead, which would have stopped at the first silent
+                // sample; that pointer code is gone, so the unsafe modifier was dropped too.
+                // NOTE(review): audioData presumably becomes invalid after Dispose() — confirm
+                // before reading samples from a disposed instance.
+                return dataSize;
+            }
+        }
+
+        /// <summary>
+        /// Gets the audio samples as a float[].
+        /// A new array is allocated on every access.
+        /// </summary>
+        public float[] AudioFloatData
+        {
+            get
+            {
+                int length = AudioDataLength;
+
+                float[] floatAudioData = new float[length];
+                Marshal.Copy(audioData, floatAudioData, 0, floatAudioData.Length);
+                return floatAudioData;
+            }
+        }
+
+
+        /// <summary>
+        /// Gets the audio as 16-bit little-endian PCM bytes.
+        /// A new array is allocated on every access.
+        /// </summary>
+        public byte[] AudioByteData
+        {
+            get
+            {
+                byte[] bytes = new byte[AudioDataLength * 2];
+                ReadData(bytes, 0);
+                return bytes;
+            }
+        }
+
+        internal SherpaOnnxGeneratedAudioResult(IntPtr intPtr, SherpaOnnxGeneratedAudio sherpaOnnx)
+        {
+            this.thisHandle = intPtr;
+            this.audioData = sherpaOnnx.audioData;
+            this.dataSize = sherpaOnnx.dataSize;
+            this.sample_rate = sherpaOnnx.sample_rate;
+        }
+
+        ~SherpaOnnxGeneratedAudioResult()
+        {
+            Dispose();
+        }
+
+        /// <summary>
+        /// Copies the float samples into a caller-supplied array.
+        /// Nothing is allocated here; the caller provides the destination.
+        /// </summary>
+        /// <param name="audioFloats">Destination array.</param>
+        /// <param name="offset">Index in the destination at which writing starts.</param>
+        /// <returns>Number of samples written.</returns>
+        public int ReadData(float[] audioFloats, int offset)
+        {
+            int length = AudioDataLength;
+
+            int c = audioFloats.Length - offset;
+            length = c >= length ? length : c;
+
+            Marshal.Copy(audioData, audioFloats, offset, length);
+            return length;
+        }
+
+        /// <summary>
+        /// Converts the samples to 16-bit PCM and writes them into a caller-supplied byte array.
+        /// Samples are clamped to [-1, 1] before scaling, so out-of-range values saturate instead of wrapping.
+        /// Note: a temporary float[] is still allocated internally through AudioFloatData.
+        /// </summary>
+        /// <param name="audioFloats">Destination byte array; AudioDataLength * 2 bytes hold all samples.</param>
+        /// <param name="offset">Index in the destination at which writing starts.</param>
+        /// <returns>Number of bytes written.</returns>
+        public int ReadData(byte[] audioFloats, int offset)
+        {
+            // Audio is stored as 16-bit PCM, so every float sample becomes two bytes.
+            var audiodata = AudioFloatData;
+
+            int length = audiodata.Length * 2;
+
+            int c = audioFloats.Length - offset;
+            c = c % 2 == 0 ? c : c - 1; // only whole 2-byte samples fit
+
+            length = c >= length ? length : c;
+
+            int p = length / 2;
+
+            for (int i = 0; i < p; i++)
+            {
+                short value = (short)(Math.Clamp(audiodata[i], -1f, 1f) * short.MaxValue);
+
+                audioFloats[offset++] = (byte)value;        // low byte first (little-endian)
+                audioFloats[offset++] = (byte)(value >> 8); // then the high byte
+            }
+
+            return length;
+
+        }
+
+        /// <summary>
+        /// Writes the audio to a WAV file through the native library.
+        /// </summary>
+        /// <param name="filename">Path of the file to write.</param>
+        /// <returns>True when the native call returns 1.</returns>
+        public bool WriteWAVFile(string filename)
+        {
+            return 1 == SherpaOnnxWriteWave(audioData, this.dataSize, this.sample_rate, filename);
+        }
+
+        public void Dispose() // safe to call repeatedly: the handle is zeroed after the first release
+        {
+            if (this.thisHandle != IntPtr.Zero)
+            {
+                SherpaOnnxDestroyOfflineTtsGeneratedAudio(this.thisHandle);
+                GC.SuppressFinalize(this);
+                this.thisHandle = IntPtr.Zero;
+            }
+        }
+    }
+
+    [StructLayout(LayoutKind.Sequential)] // marshalled from the pointer returned by the native generate call
+    internal struct SherpaOnnxGeneratedAudio
+    {
+        internal readonly IntPtr audioData;
+        internal readonly int dataSize;
+
+        /// <summary>
+        /// Sample rate.
+        /// </summary>
+        public readonly int sample_rate;
+    }
+}
diff --git a/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsConfig.cs b/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsConfig.cs
new file mode 100644
index 000000000..1d20264eb
--- /dev/null
+++ b/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsConfig.cs
@@ -0,0 +1,10 @@
+using System.Runtime.InteropServices;
+
+namespace TTS.Struct
+{
+ [StructLayout(LayoutKind.Sequential)] // sequential layout: this struct is handed to native code
+ public struct SherpaOnnxOfflineTtsConfig // top-level configuration passed to SherpaOnnxCreateOfflineTts
+ {
+ public SherpaOnnxOfflineTtsModelConfig model; // model settings: VITS files, thread count, debug flag, provider
+ }
+}
diff --git a/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsModelConfig.cs b/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsModelConfig.cs
new file mode 100644
index 000000000..46dd55859
--- /dev/null
+++ b/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsModelConfig.cs
@@ -0,0 +1,23 @@
+using System.Runtime.InteropServices;
+
+namespace TTS.Struct
+{
+ [StructLayout(LayoutKind.Sequential)]
+ public struct SherpaOnnxOfflineTtsModelConfig
+ {
+ /// <summary>
+ /// VITS model configuration.
+ /// </summary>
+ public SherpaOnnxOfflineTtsVitsModelConfig vits;
+ /// <summary>
+ /// Number of threads.
+ /// </summary>
+ public int num_threads;
+ public int debug; // the sample program sets 0; NOTE(review): presumably non-zero enables native debug output — confirm
+ /// <summary>
+ /// Execution provider name; the sample program passes "cpu".
+ /// </summary>
+ [MarshalAs(UnmanagedType.LPStr)]
+ public string provider;
+ }
+}
diff --git a/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsVitsModelConfig.cs b/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsVitsModelConfig.cs
new file mode 100644
index 000000000..4b37d81b0
--- /dev/null
+++ b/dotnet-examples/TTS/Struct/SherpaOnnxOfflineTtsVitsModelConfig.cs
@@ -0,0 +1,48 @@
+using System.Runtime.InteropServices;
+
+namespace TTS.Struct
+{
+ [StructLayout(LayoutKind.Sequential)]
+ public struct SherpaOnnxOfflineTtsVitsModelConfig
+ {
+ /// <summary>
+ /// Model file,
+ /// e.g. "vits-zh-aishell3/vits-aishell3.onnx"
+ /// </summary>
+ [MarshalAs(UnmanagedType.LPStr)]
+ public string model;
+ /// <summary>
+ /// Lexicon (pronunciation dictionary) file,
+ /// e.g. "vits-zh-aishell3/lexicon.txt"
+ /// </summary>
+ [MarshalAs(UnmanagedType.LPStr)]
+ public string lexicon;
+
+ [MarshalAs(UnmanagedType.LPStr)]
+ public string tokens; // tokens file; the constructor defaults it to "vits-zh-aishell3/tokens.txt"
+
+ /// <summary>
+ /// noise_scale of the VITS model (float, default = 0.667)
+ /// </summary>
+ public float noise_scale = 0.667f;
+ /// <summary>
+ /// noise_scale_w of the VITS model (float, default = 0.8)
+ /// </summary>
+ public float noise_scale_w = 0.8f;
+ /// <summary>
+ /// Speech speed. Larger means slower; smaller means faster. (float, default = 1)
+ /// </summary>
+ public float length_scale = 1f;
+
+ public SherpaOnnxOfflineTtsVitsModelConfig() // assigns the same defaults as the field initializers, plus the sample model paths
+ {
+ noise_scale = 0.667f;
+ noise_scale_w = 0.8f;
+ length_scale = 1f;
+
+ model = "vits-zh-aishell3/vits-aishell3.onnx";
+ lexicon = "vits-zh-aishell3/lexicon.txt";
+ tokens = "vits-zh-aishell3/tokens.txt";
+ }
+ }
+}
diff --git a/dotnet-examples/TTS/TTSCore.cs b/dotnet-examples/TTS/TTSCore.cs
new file mode 100644
index 000000000..63735a8cc
--- /dev/null
+++ b/dotnet-examples/TTS/TTSCore.cs
@@ -0,0 +1,70 @@
+using System.Runtime.InteropServices;
+using TTS.Struct;
+
+namespace TTS
+{
+    internal sealed class TTSCore : IDisposable // owns one native offline-TTS handle; Dispose() releases it
+    {
+        public const string Filename = "sherpa-onnx-c-api"; // native library name used by the DllImports below
+
+        [DllImport(Filename)] // NOTE(review): the config is passed by value — confirm the native function does not expect a pointer
+        private static extern IntPtr SherpaOnnxCreateOfflineTts(SherpaOnnxOfflineTtsConfig handle);
+        [DllImport(Filename)] // text is a pointer to UTF-8 bytes allocated by the caller
+        private static extern IntPtr SherpaOnnxOfflineTtsGenerate(IntPtr createOfflineTtsIntptr, IntPtr text, int sid, float speed);
+        [DllImport(Filename)] // releases the handle returned by SherpaOnnxCreateOfflineTts
+        private static extern void SherpaOnnxDestroyOfflineTts(IntPtr intPtr);
+
+        /// <summary>
+        /// Native handle; set to IntPtr.Zero by Dispose().
+        /// </summary>
+        private IntPtr thisHandle;
+
+        public TTSCore(SherpaOnnxOfflineTtsConfig modelConfig)
+        {
+            thisHandle = SherpaOnnxCreateOfflineTts(modelConfig);
+        }
+
+        /// <summary>
+        /// Converts text to speech.
+        /// </summary>
+        /// <param name="text">Text to synthesize.</param>
+        /// <param name="sid">Voice (speaker) id.</param>
+        /// <param name="speed">Speech speed.</param>
+        /// <returns>The generated audio; the caller should dispose it to release the native result.</returns>
+        public SherpaOnnxGeneratedAudioResult ToSpeech(string text, int sid, float speed = 1f)
+        {
+            IntPtr utf8Text = Marshal.StringToCoTaskMemUTF8(text); // unmanaged copy; freed in finally (it used to leak on every call)
+            try
+            {
+                var result = SherpaOnnxOfflineTtsGenerate(thisHandle, utf8Text, sid, speed);
+                SherpaOnnxGeneratedAudio impl = (SherpaOnnxGeneratedAudio)Marshal.PtrToStructure(result, typeof(SherpaOnnxGeneratedAudio));
+                return new SherpaOnnxGeneratedAudioResult(result, impl);
+            }
+            finally
+            {
+                Marshal.FreeCoTaskMem(utf8Text); // the native call has returned, so the text is presumably no longer referenced
+            }
+        }
+
+        /// <summary>Runs ToSpeech through Task.Run; the parameters have the same meaning as in ToSpeech.</summary>
+        public Task ToSpeechAsync(string text, int sid, float speed = 1f)
+        {
+            return Task.Run(() => ToSpeech(text, sid, speed));
+        }
+
+        ~TTSCore() // safety net for instances that were never disposed
+        {
+            Dispose();
+        }
+
+        public void Dispose() // safe to call repeatedly: the handle is zeroed after the first release
+        {
+            if (this.thisHandle != IntPtr.Zero)
+            {
+                SherpaOnnxDestroyOfflineTts(this.thisHandle);
+                GC.SuppressFinalize(this);
+                this.thisHandle = IntPtr.Zero;
+            }
+        }
+    }
+}