Skip to content

Commit

Permalink
final preparations for release
Browse files (browse the repository at this point in the history)
  • Loading branch information
Flux9665 committed Apr 10, 2023
1 parent 5144760 commit 5f1dce3
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 17 deletions.
4 changes: 3 additions & 1 deletion InferenceInterfaces/ControllableInterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,6 @@ def read(self,
energy_variance_scale=energy_variance_scale,
pause_duration_scaling_factor=pause_duration_scaling_factor,
return_plot_as_filepath=True)
return 24000, wav, fig
wav = wav.cpu().numpy()
wav = [val for val in wav for _ in (0, 1)] # doubling the sampling rate for better compatibility (24kHz is not as standard as 48kHz)
return 48000, wav, fig
4 changes: 2 additions & 2 deletions InferenceInterfaces/ToucanTTSInterface.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def read_to_file(self,
pitch_variance_scale=pitch_variance_scale,
energy_variance_scale=energy_variance_scale).cpu()), 0)
wav = torch.cat((wav, silence), 0)
wav = [val for val in wav for _ in (0, 1)]
wav = [val for val in wav for _ in (0, 1)] # doubling the sampling rate for better compatibility (24kHz is not as standard as 48kHz)
soundfile.write(file=file_location, data=wav, samplerate=48000)

def read_aloud(self,
Expand All @@ -304,7 +304,7 @@ def read_aloud(self,
pitch_variance_scale=pitch_variance_scale,
energy_variance_scale=energy_variance_scale).cpu()
wav = torch.cat((wav, torch.zeros([12000])), 0)
wav = [val for val in wav for _ in (0, 1)]
wav = [val for val in wav for _ in (0, 1)] # doubling the sampling rate for better compatibility (24kHz is not as standard as 48kHz)
sounddevice.play(wav, samplerate=48000)
if blocking:
sounddevice.wait()
25 changes: 16 additions & 9 deletions InferenceInterfaces/UtteranceCloner.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,18 @@


class UtteranceCloner:
"""
Clone the prosody of an utterance, but exchange the speaker (or don't)
def __init__(self, model_id, device):
self.tts = ToucanTTSInterface(device=device, tts_model_path=model_id)
Useful for Privacy Applications
"""

def __init__(self, model_id, device, language="en", speed_over_quality=False):
if (device == torch.device("cpu") or device == "cpu") and not speed_over_quality:
print("Warning: You are running BigVGAN on CPU. Consider either switching to GPU or setting the speed_over_quality option to True.")
self.tts = ToucanTTSInterface(device=device, tts_model_path=model_id, faster_vocoder=speed_over_quality)
self.ap = AudioPreprocessor(input_sr=16000, output_sr=16000, melspec_buckets=80, hop_length=256, n_fft=1024, cut_silence=False)
self.tf = ArticulatoryCombinedTextFrontend(language="en")
self.tf = ArticulatoryCombinedTextFrontend(language=language)
self.device = device
acoustic_checkpoint_path = os.path.join(MODELS_DIR, "Aligner", "aligner.pt")
self.aligner_weights = torch.load(acoustic_checkpoint_path, map_location='cpu')["asr_model"]
Expand Down Expand Up @@ -153,12 +160,12 @@ def clone_utterance(self,
start_sil = torch.zeros([silence_frames_start * 3]).to(self.device) # timestamps are from 16kHz, but now we're using 48kHz, so upsampling required
end_sil = torch.zeros([silence_frames_end * 3]).to(self.device) # timestamps are from 16kHz, but now we're using 48kHz, so upsampling required
cloned_speech = self.tts(reference_transcription, view=False, durations=duration, pitch=pitch, energy=energy)
cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0)
cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0).cpu().numpy()
if filename_of_result is not None:
sf.write(file=filename_of_result, data=cloned_utt.cpu().numpy(), samplerate=24000)
sf.write(file=filename_of_result, data=cloned_utt, samplerate=24000)
if clone_speaker_identity:
self.tts.default_utterance_embedding = prev_embedding.to(self.device) # return to normal
return cloned_utt.cpu().numpy()
return cloned_utt

def biblical_accurate_angel_mode(self,
path_to_reference_audio,
Expand All @@ -178,8 +185,8 @@ def biblical_accurate_angel_mode(self,
self.tts.set_utterance_embedding(path_to_reference_audio=path)
list_of_cloned_speeches.append(self.tts(reference_transcription, view=False, durations=duration, pitch=pitch, energy=energy))
cloned_speech = torch.stack(list_of_cloned_speeches).mean(dim=0)
cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0)
cloned_utt = torch.cat((start_sil, cloned_speech, end_sil), dim=0).cpu().numpy()
if filename_of_result is not None:
sf.write(file=filename_of_result, data=cloned_utt.cpu().numpy(), samplerate=24000)
sf.write(file=filename_of_result, data=cloned_utt, samplerate=24000)
self.tts.default_utterance_embedding = prev_embedding.to(self.device) # return to normal
return cloned_utt.cpu().numpy()
return cloned_utt
2 changes: 1 addition & 1 deletion run_controllable_GUI.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def read(self,
0.0, # slider 4 did not have a meaningful interpretation, too many properties mixed
emb5,
emb6)
return (sr, float2pcm(wav.cpu().numpy())), fig
return (sr, float2pcm(wav)), fig


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion run_interactive_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
if __name__ == '__main__':
warnings.filterwarnings("ignore", category=UserWarning)

PATH_TO_TTS_MODEL = os.path.join(MODELS_DIR, "ToucanTTS_Nancy", "best.pt")
PATH_TO_TTS_MODEL = os.path.join(MODELS_DIR, "ToucanTTS_Meta", "best.pt")
PATH_TO_VOCODER_MODEL = os.path.join(MODELS_DIR, "BigVGAN", "best.pt")
PATH_TO_REFERENCE_SPEAKER = "" # audios/speaker_references_for_testing/female_high_voice.wav
LANGUAGE = "en"
Expand Down
4 changes: 1 addition & 3 deletions run_prosody_override.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,5 @@
filename_of_result="audios/test_cloned_angelic.wav",
list_of_speaker_references_for_ensemble=["audios/speaker_references_for_testing/female_high_voice.wav",
"audios/speaker_references_for_testing/female_mid_voice.wav",
"audios/speaker_references_for_testing/male_low_voice.wav",
"audios/LibriTTS/174/168635/174_168635_000019_000001.wav",
"audios/test.wav"],
"audios/speaker_references_for_testing/male_low_voice.wav"],
lang="en")

0 comments on commit 5f1dce3

Please sign in to comment.