From de0a1cba6958af784d46e6c50bfa7d777bcb7644 Mon Sep 17 00:00:00 2001 From: flux9665 Date: Tue, 13 Aug 2024 17:15:09 +0200 Subject: [PATCH] fix utterance cloner --- InferenceInterfaces/UtteranceCloner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InferenceInterfaces/UtteranceCloner.py b/InferenceInterfaces/UtteranceCloner.py index cddeda0ef..17c382ce0 100644 --- a/InferenceInterfaces/UtteranceCloner.py +++ b/InferenceInterfaces/UtteranceCloner.py @@ -150,7 +150,7 @@ def clone_utterance(self, self.tts.set_language(lang) start_sil = numpy.zeros([int(silence_frames_start * 1.5)]) # timestamps are from 16kHz, but now we're using 24000Hz, so upsampling required end_sil = numpy.zeros([int(silence_frames_end * 1.5)]) # timestamps are from 16kHz, but now we're using 24000Hz, so upsampling required - cloned_speech, sr = self.tts(transcription_of_intonation_reference, view=False, durations=duration, pitch=pitch, energy=energy) + cloned_speech, sr = self.tts(transcription_of_intonation_reference, view=False, durations=duration, pitch=pitch.transpose(0, 1), energy=energy.transpose(0, 1)) cloned_utt = numpy.concatenate([start_sil, cloned_speech, end_sil], axis=0) if filename_of_result is not None: sf.write(file=filename_of_result, data=float2pcm(cloned_utt), samplerate=sr, subtype="PCM_16")