From 5f1dce3ba60a7a8a5550a6327154f46376920c92 Mon Sep 17 00:00:00 2001 From: Florian Lux Date: Mon, 10 Apr 2023 19:49:01 +0200 Subject: [PATCH] final preparations for release --- InferenceInterfaces/ControllableInterface.py | 4 +++- InferenceInterfaces/ToucanTTSInterface.py | 4 ++-- InferenceInterfaces/UtteranceCloner.py | 25 +++++++++++++------- run_controllable_GUI.py | 2 +- run_interactive_demo.py | 2 +- run_prosody_override.py | 4 +--- 6 files changed, 24 insertions(+), 17 deletions(-) diff --git a/InferenceInterfaces/ControllableInterface.py b/InferenceInterfaces/ControllableInterface.py index 87124661..8e543aaf 100644 --- a/InferenceInterfaces/ControllableInterface.py +++ b/InferenceInterfaces/ControllableInterface.py @@ -119,4 +119,6 @@ def read(self, energy_variance_scale=energy_variance_scale, pause_duration_scaling_factor=pause_duration_scaling_factor, return_plot_as_filepath=True) - return 24000, wav, fig + wav = wav.cpu().numpy() + wav = [val for val in wav for _ in (0, 1)] # doubling the sampling rate for better compatibility (24kHz is not as standard as 48kHz) + return 48000, wav, fig diff --git a/InferenceInterfaces/ToucanTTSInterface.py b/InferenceInterfaces/ToucanTTSInterface.py index d17459b7..accce848 100644 --- a/InferenceInterfaces/ToucanTTSInterface.py +++ b/InferenceInterfaces/ToucanTTSInterface.py @@ -286,7 +286,7 @@ def read_to_file(self, pitch_variance_scale=pitch_variance_scale, energy_variance_scale=energy_variance_scale).cpu()), 0) wav = torch.cat((wav, silence), 0) - wav = [val for val in wav for _ in (0, 1)] + wav = [val for val in wav for _ in (0, 1)] # doubling the sampling rate for better compatibility (24kHz is not as standard as 48kHz) soundfile.write(file=file_location, data=wav, samplerate=48000) def read_aloud(self, @@ -304,7 +304,7 @@ def read_aloud(self, pitch_variance_scale=pitch_variance_scale, energy_variance_scale=energy_variance_scale).cpu() wav = torch.cat((wav, torch.zeros([12000])), 0) - wav = [val for val in wav for _ in (0, 1)] + wav = [val for val in wav for _ in (0, 1)] # doubling the sampling rate for better compatibility (24kHz is not as standard as 48kHz) sounddevice.play(wav, samplerate=48000) if blocking: sounddevice.wait() diff --git a/InferenceInterfaces/UtteranceCloner.py b/InferenceInterfaces/UtteranceCloner.py index b2cb94ea..26704688 100644 --- a/InferenceInterfaces/UtteranceCloner.py +++ b/InferenceInterfaces/UtteranceCloner.py @@ -17,11 +17,18 @@ class UtteranceCloner: + """ + Clone the prosody of an utterance, but exchange the speaker (or don't) - def __init__(self, model_id, device): - self.tts = ToucanTTSInterface(device=device, tts_model_path=model_id) + Useful for Privacy Applications + """ + + def __init__(self, model_id, device, language="en", speed_over_quality=False): + if (device == torch.device("cpu") or device == "cpu") and not speed_over_quality: + print("Warning: You are running BigVGAN on CPU. Consider either switching to GPU or setting the speed_over_quality option to True.") + self.tts = ToucanTTSInterface(device=device, tts_model_path=model_id, faster_vocoder=speed_over_quality) self.ap = AudioPreprocessor(input_sr=16000, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False) - self.tf = ArticulatoryCombinedTextFrontend(language="en") + self.tf = ArticulatoryCombinedTextFrontend(language=language) self.device = device acoustic_checkpoint_path = os.path.join(MODELS_DIR, "Aligner", "aligner.pt") self.aligner_weights = torch.load(acoustic_checkpoint_path, map_location='cpu')["asr_model"] @@ -153,12 +160,12 @@ def clone_utterance(self, start_sil = torch.zeros([silence_frames_start * 3]).to(self.device) # timestamps are from 16kHz, but now we're using 48kHz, so upsampling required end_sil = torch.zeros([silence_frames_end * 3]).to(self.device) # timestamps are from 16kHz, but now we're using 48kHz, so upsampling required cloned_speech = self.tts(reference_transcription, view=False, durations=duration, pitch=pitch, energy=energy) - cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0) + cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0).cpu().numpy() if filename_of_result is not None: - sf.write(file=filename_of_result, data=cloned_utt.cpu().numpy(), samplerate=24000) + sf.write(file=filename_of_result, data=cloned_utt, samplerate=24000) if clone_speaker_identity: self.tts.default_utterance_embedding = prev_embedding.to(self.device) # return to normal - return cloned_utt.cpu().numpy() + return cloned_utt def biblical_accurate_angel_mode(self, path_to_reference_audio, @@ -178,8 +185,8 @@ def biblical_accurate_angel_mode(self, self.tts.set_utterance_embedding(path_to_reference_audio=path) list_of_cloned_speeches.append(self.tts(reference_transcription, view=False, durations=duration, pitch=pitch, energy=energy)) cloned_speech = torch.stack(list_of_cloned_speeches).mean(dim=0) - cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0) + cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0).cpu().numpy() if filename_of_result is not None: - sf.write(file=filename_of_result, data=cloned_utt.cpu().numpy(), samplerate=24000) + sf.write(file=filename_of_result, data=cloned_utt, samplerate=24000) self.tts.default_utterance_embedding = prev_embedding.to(self.device) # return to normal - return cloned_utt.cpu().numpy() + return cloned_utt diff --git a/run_controllable_GUI.py b/run_controllable_GUI.py index fcdde863..72592fd4 100644 --- a/run_controllable_GUI.py +++ b/run_controllable_GUI.py @@ -120,7 +120,7 @@ def read(self, 0.0, # slider 4 did not have a meaningful interpretation, too many properties mixed emb5, emb6) - return (sr, float2pcm(wav.cpu().numpy())), fig + return (sr, float2pcm(wav)), fig if __name__ == '__main__': diff --git a/run_interactive_demo.py b/run_interactive_demo.py index a455d716..82132954 100644 --- a/run_interactive_demo.py +++ b/run_interactive_demo.py @@ -10,7 +10,7 @@ if __name__ == '__main__': warnings.filterwarnings("ignore", category=UserWarning) - PATH_TO_TTS_MODEL = os.path.join(MODELS_DIR, "ToucanTTS_Nancy", "best.pt") + PATH_TO_TTS_MODEL = os.path.join(MODELS_DIR, "ToucanTTS_Meta", "best.pt") PATH_TO_VOCODER_MODEL = os.path.join(MODELS_DIR, "BigVGAN", "best.pt") PATH_TO_REFERENCE_SPEAKER = "" # audios/speaker_references_for_testing/female_high_voice.wav LANGUAGE = "en" diff --git a/run_prosody_override.py b/run_prosody_override.py index 848a3999..9e95f555 100644 --- a/run_prosody_override.py +++ b/run_prosody_override.py @@ -16,7 +16,5 @@ filename_of_result="audios/test_cloned_angelic.wav", list_of_speaker_references_for_ensemble=["audios/speaker_references_for_testing/female_high_voice.wav", "audios/speaker_references_for_testing/female_mid_voice.wav", - "audios/speaker_references_for_testing/male_low_voice.wav", - "audios/LibriTTS/174/168635/174_168635_000019_000001.wav", - "audios/test.wav"], + "audios/speaker_references_for_testing/male_low_voice.wav"], lang="en")