From 5f1dce3ba60a7a8a5550a6327154f46376920c92 Mon Sep 17 00:00:00 2001
From: Florian Lux <lux.florian@gmail.com>
Date: Mon, 10 Apr 2023 19:49:01 +0200
Subject: [PATCH] final preparations for release

---
 InferenceInterfaces/ControllableInterface.py |  4 +++-
 InferenceInterfaces/ToucanTTSInterface.py    |  4 ++--
 InferenceInterfaces/UtteranceCloner.py       | 25 +++++++++++++-------
 run_controllable_GUI.py                      |  2 +-
 run_interactive_demo.py                      |  2 +-
 run_prosody_override.py                      |  4 +---
 6 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/InferenceInterfaces/ControllableInterface.py b/InferenceInterfaces/ControllableInterface.py
index 87124661..8e543aaf 100644
--- a/InferenceInterfaces/ControllableInterface.py
+++ b/InferenceInterfaces/ControllableInterface.py
@@ -119,4 +119,6 @@ def read(self,
                               energy_variance_scale=energy_variance_scale,
                               pause_duration_scaling_factor=pause_duration_scaling_factor,
                               return_plot_as_filepath=True)
-        return 24000, wav, fig
+        wav = wav.cpu().numpy()
+        wav = [val for val in wav for _ in (0, 1)]  # doubling the sampling rate for better compatibility (24kHz is not as standard as 48kHz)
+        return 48000, wav, fig
diff --git a/InferenceInterfaces/ToucanTTSInterface.py b/InferenceInterfaces/ToucanTTSInterface.py
index d17459b7..accce848 100644
--- a/InferenceInterfaces/ToucanTTSInterface.py
+++ b/InferenceInterfaces/ToucanTTSInterface.py
@@ -286,7 +286,7 @@ def read_to_file(self,
                                                pitch_variance_scale=pitch_variance_scale,
                                                energy_variance_scale=energy_variance_scale).cpu()), 0)
                     wav = torch.cat((wav, silence), 0)
-        wav = [val for val in wav for _ in (0, 1)]
+        wav = [val for val in wav for _ in (0, 1)]  # doubling the sampling rate for better compatibility (24kHz is not as standard as 48kHz)
         soundfile.write(file=file_location, data=wav, samplerate=48000)
 
     def read_aloud(self,
@@ -304,7 +304,7 @@ def read_aloud(self,
                    pitch_variance_scale=pitch_variance_scale,
                    energy_variance_scale=energy_variance_scale).cpu()
         wav = torch.cat((wav, torch.zeros([12000])), 0)
-        wav = [val for val in wav for _ in (0, 1)]
+        wav = [val for val in wav for _ in (0, 1)]  # doubling the sampling rate for better compatibility (24kHz is not as standard as 48kHz)
         sounddevice.play(wav, samplerate=48000)
         if blocking:
             sounddevice.wait()
diff --git a/InferenceInterfaces/UtteranceCloner.py b/InferenceInterfaces/UtteranceCloner.py
index b2cb94ea..26704688 100644
--- a/InferenceInterfaces/UtteranceCloner.py
+++ b/InferenceInterfaces/UtteranceCloner.py
@@ -17,11 +17,18 @@
 
 
 class UtteranceCloner:
+    """
+    Clone the prosody of a reference utterance, optionally exchanging the speaker identity.
 
-    def __init__(self, model_id, device):
-        self.tts = ToucanTTSInterface(device=device, tts_model_path=model_id)
+    Useful for privacy applications.
+    """
+
+    def __init__(self, model_id, device, language="en", speed_over_quality=False):
+        if (device == torch.device("cpu") or device == "cpu") and not speed_over_quality:
+            print("Warning: You are running BigVGAN on CPU. Consider either switching to GPU or setting the speed_over_quality option to True.")
+        self.tts = ToucanTTSInterface(device=device, tts_model_path=model_id, faster_vocoder=speed_over_quality)
         self.ap = AudioPreprocessor(input_sr=16000, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False)
-        self.tf = ArticulatoryCombinedTextFrontend(language="en")
+        self.tf = ArticulatoryCombinedTextFrontend(language=language)
         self.device = device
         acoustic_checkpoint_path = os.path.join(MODELS_DIR, "Aligner", "aligner.pt")
         self.aligner_weights = torch.load(acoustic_checkpoint_path, map_location='cpu')["asr_model"]
@@ -153,12 +160,12 @@ def clone_utterance(self,
         start_sil = torch.zeros([silence_frames_start * 3]).to(self.device)  # timestamps are from 16kHz, but now we're using 48kHz, so upsampling required
         end_sil = torch.zeros([silence_frames_end * 3]).to(self.device)  # timestamps are from 16kHz, but now we're using 48kHz, so upsampling required
         cloned_speech = self.tts(reference_transcription, view=False, durations=duration, pitch=pitch, energy=energy)
-        cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0)
+        cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0).cpu().numpy()
         if filename_of_result is not None:
-            sf.write(file=filename_of_result, data=cloned_utt.cpu().numpy(), samplerate=24000)
+            sf.write(file=filename_of_result, data=cloned_utt, samplerate=24000)
         if clone_speaker_identity:
             self.tts.default_utterance_embedding = prev_embedding.to(self.device)  # return to normal
-        return cloned_utt.cpu().numpy()
+        return cloned_utt
 
     def biblical_accurate_angel_mode(self,
                                      path_to_reference_audio,
@@ -178,8 +185,8 @@ def biblical_accurate_angel_mode(self,
             self.tts.set_utterance_embedding(path_to_reference_audio=path)
             list_of_cloned_speeches.append(self.tts(reference_transcription, view=False, durations=duration, pitch=pitch, energy=energy))
         cloned_speech = torch.stack(list_of_cloned_speeches).mean(dim=0)
-        cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0)
+        cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0).cpu().numpy()
         if filename_of_result is not None:
-            sf.write(file=filename_of_result, data=cloned_utt.cpu().numpy(), samplerate=24000)
+            sf.write(file=filename_of_result, data=cloned_utt, samplerate=24000)
         self.tts.default_utterance_embedding = prev_embedding.to(self.device)  # return to normal
-        return cloned_utt.cpu().numpy()
+        return cloned_utt
diff --git a/run_controllable_GUI.py b/run_controllable_GUI.py
index fcdde863..72592fd4 100644
--- a/run_controllable_GUI.py
+++ b/run_controllable_GUI.py
@@ -120,7 +120,7 @@ def read(self,
                                                  0.0,  # slider 4 did not have a meaningful interpretation, too many properties mixed
                                                  emb5,
                                                  emb6)
-        return (sr, float2pcm(wav.cpu().numpy())), fig
+        return (sr, float2pcm(wav)), fig
 
 
 if __name__ == '__main__':
diff --git a/run_interactive_demo.py b/run_interactive_demo.py
index a455d716..82132954 100644
--- a/run_interactive_demo.py
+++ b/run_interactive_demo.py
@@ -10,7 +10,7 @@
 if __name__ == '__main__':
     warnings.filterwarnings("ignore", category=UserWarning)
 
-    PATH_TO_TTS_MODEL = os.path.join(MODELS_DIR, "ToucanTTS_Nancy", "best.pt")
+    PATH_TO_TTS_MODEL = os.path.join(MODELS_DIR, "ToucanTTS_Meta", "best.pt")
     PATH_TO_VOCODER_MODEL = os.path.join(MODELS_DIR, "BigVGAN", "best.pt")
     PATH_TO_REFERENCE_SPEAKER = ""  # audios/speaker_references_for_testing/female_high_voice.wav
     LANGUAGE = "en"
diff --git a/run_prosody_override.py b/run_prosody_override.py
index 848a3999..9e95f555 100644
--- a/run_prosody_override.py
+++ b/run_prosody_override.py
@@ -16,7 +16,5 @@
                                     filename_of_result="audios/test_cloned_angelic.wav",
                                     list_of_speaker_references_for_ensemble=["audios/speaker_references_for_testing/female_high_voice.wav",
                                                                              "audios/speaker_references_for_testing/female_mid_voice.wav",
-                                                                             "audios/speaker_references_for_testing/male_low_voice.wav",
-                                                                             "audios/LibriTTS/174/168635/174_168635_000019_000001.wav",
-                                                                             "audios/test.wav"],
+                                                                             "audios/speaker_references_for_testing/male_low_voice.wav"],
                                     lang="en")