diff --git a/InferenceInterfaces/UtteranceCloner.py b/InferenceInterfaces/UtteranceCloner.py index cddeda0ef..17c382ce0 100644 --- a/InferenceInterfaces/UtteranceCloner.py +++ b/InferenceInterfaces/UtteranceCloner.py @@ -150,7 +150,7 @@ def clone_utterance(self, self.tts.set_language(lang) start_sil = numpy.zeros([int(silence_frames_start * 1.5)]) # timestamps are from 16kHz, but now we're using 24000Hz, so upsampling required end_sil = numpy.zeros([int(silence_frames_end * 1.5)]) # timestamps are from 16kHz, but now we're using 24000Hz, so upsampling required - cloned_speech, sr = self.tts(transcription_of_intonation_reference, view=False, durations=duration, pitch=pitch, energy=energy) + cloned_speech, sr = self.tts(transcription_of_intonation_reference, view=False, durations=duration, pitch=pitch.transpose(0, 1), energy=energy.transpose(0, 1)) cloned_utt = numpy.concatenate([start_sil, cloned_speech, end_sil], axis=0) if filename_of_result is not None: sf.write(file=filename_of_result, data=float2pcm(cloned_utt), samplerate=sr, subtype="PCM_16")