From de0a1cba6958af784d46e6c50bfa7d777bcb7644 Mon Sep 17 00:00:00 2001
From: flux9665 <lux.florian@gmail.com>
Date: Tue, 13 Aug 2024 17:15:09 +0200
Subject: [PATCH] fix utterance cloner

---
 InferenceInterfaces/UtteranceCloner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/InferenceInterfaces/UtteranceCloner.py b/InferenceInterfaces/UtteranceCloner.py
index cddeda0ef..17c382ce0 100644
--- a/InferenceInterfaces/UtteranceCloner.py
+++ b/InferenceInterfaces/UtteranceCloner.py
@@ -150,7 +150,7 @@ def clone_utterance(self,
         self.tts.set_language(lang)
         start_sil = numpy.zeros([int(silence_frames_start * 1.5)])  # timestamps are from 16kHz, but now we're using 24000Hz, so upsampling required
         end_sil = numpy.zeros([int(silence_frames_end * 1.5)])  # timestamps are from 16kHz, but now we're using 24000Hz, so upsampling required
-        cloned_speech, sr = self.tts(transcription_of_intonation_reference, view=False, durations=duration, pitch=pitch, energy=energy)
+        cloned_speech, sr = self.tts(transcription_of_intonation_reference, view=False, durations=duration, pitch=pitch.transpose(0, 1), energy=energy.transpose(0, 1))
         cloned_utt = numpy.concatenate([start_sil, cloned_speech, end_sil], axis=0)
         if filename_of_result is not None:
             sf.write(file=filename_of_result, data=float2pcm(cloned_utt), samplerate=sr, subtype="PCM_16")