diff --git a/InferenceInterfaces/UtteranceCloner.py b/InferenceInterfaces/UtteranceCloner.py
index cddeda0ef..17c382ce0 100644
--- a/InferenceInterfaces/UtteranceCloner.py
+++ b/InferenceInterfaces/UtteranceCloner.py
@@ -150,7 +150,7 @@ def clone_utterance(self,
         self.tts.set_language(lang)
         start_sil = numpy.zeros([int(silence_frames_start * 1.5)])  # timestamps are from 16kHz, but now we're using 24000Hz, so upsampling required
         end_sil = numpy.zeros([int(silence_frames_end * 1.5)])  # timestamps are from 16kHz, but now we're using 24000Hz, so upsampling required
-        cloned_speech, sr = self.tts(transcription_of_intonation_reference, view=False, durations=duration, pitch=pitch, energy=energy)
+        cloned_speech, sr = self.tts(transcription_of_intonation_reference, view=False, durations=duration, pitch=pitch.transpose(0, 1), energy=energy.transpose(0, 1))
         cloned_utt = numpy.concatenate([start_sil, cloned_speech, end_sil], axis=0)
         if filename_of_result is not None:
             sf.write(file=filename_of_result, data=float2pcm(cloned_utt), samplerate=sr, subtype="PCM_16")