From 7bc24a4f61c4d8336789f8b04a6805f4deae5da9 Mon Sep 17 00:00:00 2001
From: Florian Lux <lux.florian@gmail.com>
Date: Fri, 28 Jun 2024 00:52:09 +0200
Subject: [PATCH] make sure the channels are ordered correctly in a reference audio file

---
 InferenceInterfaces/ToucanTTSInterface.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/InferenceInterfaces/ToucanTTSInterface.py b/InferenceInterfaces/ToucanTTSInterface.py
index 20623505..5cde6e10 100644
--- a/InferenceInterfaces/ToucanTTSInterface.py
+++ b/InferenceInterfaces/ToucanTTSInterface.py
@@ -105,6 +105,9 @@ def set_utterance_embedding(self, path_to_reference_audio="", embedding=None):
             speaker_embs = list()
             for path in path_to_reference_audio:
                 wave, sr = soundfile.read(path)
+                if len(wave.shape) > 1:  # oh no, we found a stereo audio!
+                    if len(wave[0]) == 2:  # let's figure out whether we need to switch the axes
+                        wave = wave.transpose()  # if yes, we switch the axes.
                 wave = librosa.to_mono(wave)
                 wave = Resample(orig_freq=sr, new_freq=16000).to(self.device)(torch.tensor(wave, device=self.device, dtype=torch.float32))
                 speaker_embedding = self.speaker_embedding_func_ecapa.encode_batch(wavs=wave.to(self.device).squeeze().unsqueeze(0)).squeeze()