From 1fce252e9597ab1557e966c7e70d0bc107284aa2 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Wed, 12 Jul 2023 18:46:30 +0200 Subject: [PATCH 01/26] Catch unexpected errors in test script execution If the script failes, the parsing will results with empty dependency list. Check Python test script exit code and raise an exception. --- src/examples/python/show_algo_dependencies.py | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/examples/python/show_algo_dependencies.py b/src/examples/python/show_algo_dependencies.py index 93009b267..fdc18911e 100644 --- a/src/examples/python/show_algo_dependencies.py +++ b/src/examples/python/show_algo_dependencies.py @@ -26,7 +26,6 @@ import argparse - def find_dependencies(mode, algo): if algo in ['MusicExtractor', 'FreesoundExtractor']: # FIXME These are special algorithms that instantiate all dependencies @@ -56,6 +55,10 @@ def find_dependencies(mode, algo): proc = subprocess.Popen([sys.executable, "-c", code], stderr=subprocess.PIPE) stderr = proc.communicate()[1].decode('utf8').split('\n') + exit_code = proc.wait() + + if exit_code != 0: + raise Exception(f"Python script running {algo} algorithms failed with error:\n" + "-" * 80 + "\n" + "\n".join(stderr)) # function to assign nested dict elements by a list of nested keys def set_val(d, keys, val): @@ -71,7 +74,7 @@ def set_val(d, keys, val): # NOTE: the code relies heavily on indentification of output in Essentia's logger for line in stderr: if line.startswith("[Factory ] "): - + line = line.replace("[Factory ] ", "") if line.count("Streaming: Creating algorithm: "): tab, a = line.split("Streaming: Creating algorithm: ") @@ -79,12 +82,12 @@ def set_val(d, keys, val): elif line.count("Standard : Creating algorithm: "): tab, a = line.split("Standard : Creating algorithm: ") m = "standard" - else: + else: continue lines.append(line) algos.append((m, a)) - + indent = len(tab) if indent < previous_indent: @@ -93,10 +96,10 @@ def 
set_val(d, keys, val): previous_key = previous_key[:-1] set_val(tree, previous_key + [(m,a)], {}) - previous_key += [(m, a)] + previous_key += [(m, a)] previous_indent = indent - + algos = sorted(list(set(algos))) #algos = sorted(list(set(algos) - set([(mode, algo)]))) return algos, tree, lines @@ -125,12 +128,12 @@ def print_dependencies(algos, tree=None, lines=None): parser = argparse.ArgumentParser(description="Analyze Essentia's algorithm dependencies.") - parser.add_argument("-a", "--algorithm", dest="algo", + parser.add_argument("-a", "--algorithm", dest="algo", help="algorithm to inspect", action="append", choices=set(essentia.standard.algorithmNames() + essentia.streaming.algorithmNames())) - parser.add_argument("-m", "--mode", dest="mode", - help="mode (streaming, standard)", + parser.add_argument("-m", "--mode", dest="mode", + help="mode (streaming, standard)", choices=set(("standard", "streaming"))) args = vars(parser.parse_args()) @@ -151,7 +154,7 @@ def print_dependencies(algos, tree=None, lines=None): else: print("Algorithm was not specified. Analyze dependencies for all algorithms") - if args['mode']: + if args['mode']: algos = [(a, m) for a, m in algos if m==args['mode']] else: print("Mode was not specified. 
Analyze dependencies for both modes") @@ -164,7 +167,7 @@ def print_dependencies(algos, tree=None, lines=None): for algo, mode in algos: print("---------- %s : %s ----------" % (mode, algo)) - dependencies, tree, _ = find_dependencies(mode, algo) + dependencies, tree, _ = find_dependencies(mode, algo) #print_dependencies(dependencies, tree) print_dependencies(dependencies) all_dependencies += dependencies From 062928c99e3e2cdaeb9a59139044c3a98c2fe071 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Thu, 13 Jul 2023 13:16:50 +0200 Subject: [PATCH 02/26] Update research papers --- doc/sphinxdoc/research_papers.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/sphinxdoc/research_papers.md b/doc/sphinxdoc/research_papers.md index acee4385a..e1212c1e9 100644 --- a/doc/sphinxdoc/research_papers.md +++ b/doc/sphinxdoc/research_papers.md @@ -79,6 +79,8 @@ Indexing music by mood: design and integration of an automatic content-based ann ## Emotion detection +- Azuaje, G., Liew, K., Epure, E., Yada, S., Wakamiya, S., & Aramaki, E. (2023). Visualyre: multimodal album art generation for independent musicians. Personal and Ubiquitous Computing, 1-12. + - S. Chowdhury, and G. Widmer. On perceived emotion in expressive piano performance: Further experimental evidence for the relevance of mid-level perceptual features. In International Society for Music Information Retrieval (ISMIR 2021), 2021. - Byun, S. W., Lee, S. P. A Study on a Speech Emotion Recognition System with Effective Acoustic Features Using Deep Learning Algorithms. Applied Sciences, 11(4), 1890, 2021. From 9ceef0a3a7900ad51eb42aa5b6c6789b4bec6a47 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Thu, 13 Jul 2023 19:52:02 +0200 Subject: [PATCH 03/26] Fix AudioWriter from failing after libavcodec updates Since FFmpeg 4.4.2 the code interfacing with libavcodec started to fail with "Invalid argument" error. 
This commit: - Fixes the "Invalid argument" error by specifying explicitly the number of channels in frames. - Improves error message for diagnostics. --- src/essentia/utils/audiocontext.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/essentia/utils/audiocontext.cpp b/src/essentia/utils/audiocontext.cpp index dc75898bc..8907887d8 100644 --- a/src/essentia/utils/audiocontext.cpp +++ b/src/essentia/utils/audiocontext.cpp @@ -310,6 +310,7 @@ void AudioContext::encodePacket(int size) { frame->nb_samples = _codecCtx->frame_size; frame->format = _codecCtx->sample_fmt; frame->channel_layout = _codecCtx->channel_layout; + frame->channels = _codecCtx->channels; int result = avcodec_fill_audio_frame(frame, _codecCtx->channels, _codecCtx->sample_fmt, bufferFmt, outputPlaneSize * _codecCtx->channels, 0); @@ -328,8 +329,13 @@ void AudioContext::encodePacket(int size) { packet.size = 0; int got_output; - if (avcodec_encode_audio2(_codecCtx, &packet, frame, &got_output) < 0) { - throw EssentiaException("Error while encoding audio frame"); + result = avcodec_encode_audio2(_codecCtx, &packet, frame, &got_output); + if (result < 0) { + char errstring[1204]; + av_strerror(result, errstring, sizeof(errstring)); + ostringstream msg; + msg << "Error while encoding audio frame: " << errstring; + throw EssentiaException(msg); } if (got_output) { // packet is not empty, write the frame in the media file From 49317df0d4f4498b3e35644f3fcbaa948d2c9c74 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Fri, 21 Jul 2023 00:17:22 +0200 Subject: [PATCH 04/26] Fix some spelling mistakes in docs --- src/algorithms/standard/iffta.h | 2 +- src/algorithms/standard/ifftk.h | 2 +- src/algorithms/standard/ifftw.h | 2 +- src/algorithms/standard/ifftwcomplex.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/algorithms/standard/iffta.h b/src/algorithms/standard/iffta.h index 081bdd37d..2d52add34 100644 --- a/src/algorithms/standard/iffta.h +++ 
b/src/algorithms/standard/iffta.h @@ -49,7 +49,7 @@ class IFFTA : public Algorithm { void declareParameters() { declareParameter("size", "the expected size of the input frame. This is purely optional and only targeted at optimizing the creation time of the FFT object", "[1,inf)", 1024); - declareParameter("normalize", "wheter to normalize the output by the FFT length.", "{true,false}", true); + declareParameter("normalize", "whether to normalize the output by the FFT length.", "{true,false}", true); } diff --git a/src/algorithms/standard/ifftk.h b/src/algorithms/standard/ifftk.h index dfc7065ac..8b61c33b0 100644 --- a/src/algorithms/standard/ifftk.h +++ b/src/algorithms/standard/ifftk.h @@ -44,7 +44,7 @@ class IFFTK : public Algorithm { void declareParameters() { declareParameter("size", "the expected size of the input frame. This is purely optional and only targeted at optimizing the creation time of the FFT object", "[1,inf)", 1024); - declareParameter("normalize", "wheter to normalize the output by the FFT length.", "{true,false}", true); + declareParameter("normalize", "whether to normalize the output by the FFT length.", "{true,false}", true); } diff --git a/src/algorithms/standard/ifftw.h b/src/algorithms/standard/ifftw.h index 6308f15ec..8ff0cc9e9 100644 --- a/src/algorithms/standard/ifftw.h +++ b/src/algorithms/standard/ifftw.h @@ -44,7 +44,7 @@ class IFFTW : public Algorithm { void declareParameters() { declareParameter("size", "the expected size of the input frame. 
This is purely optional and only targeted at optimizing the creation time of the FFT object", "[1,inf)", 1024); - declareParameter("normalize", "wheter to normalize the output by the FFT length.", "{true,false}", true); + declareParameter("normalize", "whether to normalize the output by the FFT length.", "{true,false}", true); } diff --git a/src/algorithms/standard/ifftwcomplex.h b/src/algorithms/standard/ifftwcomplex.h index 33197c909..b76833815 100644 --- a/src/algorithms/standard/ifftwcomplex.h +++ b/src/algorithms/standard/ifftwcomplex.h @@ -45,7 +45,7 @@ class IFFTWComplex : public Algorithm { void declareParameters() { declareParameter("size", "the expected size of the input frame. This is purely optional and only targeted at optimizing the creation time of the FFT object", "[1,inf)", 1024); - declareParameter("normalize", "wheter to normalize the output by the FFT length.", "{true,false}", true); + declareParameter("normalize", "whether to normalize the output by the FFT length.", "{true,false}", true); } From d6800ae1f17ed3b0b2b8560e63b3d341f243a2fa Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Tue, 26 Sep 2023 18:30:45 +0200 Subject: [PATCH 05/26] ciwheelbuild: disable builds for i686 NumPy does not provide wheels for i686 and they are built from source with pip for Essentia wheels builds. However, since Numpy 1.26.0, the build fails due to missing BLAS: https://github.com/numpy/numpy/issues/24703 The simplest solution, similar to other projects, is to disable i686 builds entirely, given that the i686 is getting too outdated and uncommon. 
--- pyproject-tensorflow.toml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject-tensorflow.toml b/pyproject-tensorflow.toml index e4759a621..112da2fde 100644 --- a/pyproject-tensorflow.toml +++ b/pyproject-tensorflow.toml @@ -4,7 +4,7 @@ manylinux-x86_64-image = "mtgupf/essentia-builds:manylinux2014_x86_64" # Only support x86_64 for essentia-tensorflow build = "cp**-manylinux_x86_64" -skip = ["pp*", "*-musllinux*"] +skip = ["pp*", "*-musllinux*", "*i686"] environment = { PROJECT_NAME="essentia-tensorflow", ESSENTIA_PROJECT_NAME="${PROJECT_NAME}", ESSENTIA_WHEEL_SKIP_3RDPARTY=1, ESSENTIA_WHEEL_ONLY_PYTHON=1 } diff --git a/pyproject.toml b/pyproject.toml index 0c4f4c08b..a943f3371 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ manylinux-x86_64-image = "mtgupf/essentia-builds:manylinux2014_x86_64" manylinux-i686-image = "mtgupf/essentia-builds:manylinux2014_i686" -skip = ["pp*", "*-musllinux*"] +skip = ["pp*", "*-musllinux*", "*i686"] environment = { PROJECT_NAME="essentia", ESSENTIA_PROJECT_NAME="${PROJECT_NAME}", ESSENTIA_WHEEL_SKIP_3RDPARTY=1, ESSENTIA_WHEEL_ONLY_PYTHON=1 } From 4c48bb1cabfd2f73f4555065806df67922c39ac0 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Wed, 27 Sep 2023 12:23:17 +0200 Subject: [PATCH 06/26] docs: clarify no support for TensorFlow on iOS --- FAQ.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/FAQ.md b/FAQ.md index 1b97c94f1..a048b16ac 100644 --- a/FAQ.md +++ b/FAQ.md @@ -153,6 +153,8 @@ A lightweight version of Essentia for iOS can be compiled using the ```--cross-c You can also compile it for iOS simulator (so that you can test on your desktop) using ```--cross-compile-ios-sim``` flag. +Please note that TensorFlow-based Essentia algorithms are not supported on iOS at the moment because we do not currently offer a TensorFlowLite wrapper. 
+ Compiling Essentia to ASM.js or WebAssembly using Emscripten ------------------------------------------------------------ From efc65d8e5f73fc376361badf0b15a20d3246c02a Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Mon, 16 Oct 2023 17:59:02 +0200 Subject: [PATCH 07/26] PredominantPitchMelodia: reset algorithm on each compute(). Fixes #1374 --- src/algorithms/tonal/predominantpitchmelodia.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/algorithms/tonal/predominantpitchmelodia.cpp b/src/algorithms/tonal/predominantpitchmelodia.cpp index 085ff30be..323b1835d 100644 --- a/src/algorithms/tonal/predominantpitchmelodia.cpp +++ b/src/algorithms/tonal/predominantpitchmelodia.cpp @@ -229,6 +229,8 @@ void PredominantPitchMelodia::compute() { _pitchContoursMelody->output("pitchConfidence").set(pitchConfidence); _pitchContoursMelody->compute(); + + reset(); } PredominantPitchMelodia::~PredominantPitchMelodia() { From fb9e5e9a4c42b2af841bc73e9462095cf4d99b88 Mon Sep 17 00:00:00 2001 From: palonso Date: Thu, 19 Oct 2023 16:49:46 +0200 Subject: [PATCH 08/26] Add TensorflowPredictMAEST and unit tests This algorithm is based on the implementation of TensorflowPredictMusiCNN since it requires the same mel-spectrogram signature. There are two features specific to this algorithm: - When `patchSize` and `patchHopSize` are not set, it parses the `graphFilename` to try to set the adequate value. This is useful for the MAEST variations with a different input sequence length. - The algorithm throws an exception when the input mel-spectrogram is too short to produce a patch. The values for the regression test were computed with the original MAEST implementation: https://github.com/palonso/MAEST/.
--- .../tensorflowpredictmaest.cpp | 280 ++++++++++++++++++ .../machinelearning/tensorflowpredictmaest.h | 141 +++++++++ .../test_tensorflowpredictmaest.py | 99 +++++++ 3 files changed, 520 insertions(+) create mode 100644 src/algorithms/machinelearning/tensorflowpredictmaest.cpp create mode 100644 src/algorithms/machinelearning/tensorflowpredictmaest.h create mode 100644 test/src/unittests/machinelearning/test_tensorflowpredictmaest.py diff --git a/src/algorithms/machinelearning/tensorflowpredictmaest.cpp b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp new file mode 100644 index 000000000..2d30cc970 --- /dev/null +++ b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp @@ -0,0 +1,280 @@ +/* + * Copyright (C) 2006-2023 Music Technology Group - Universitat Pompeu Fabra + * + * This file is part of Essentia + * + * Essentia is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation (FSF), either version 3 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the Affero GNU General Public License + * version 3 along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include "tensorflowpredictmaest.h" + +using namespace std; + +namespace essentia { +namespace streaming { + +const char* TensorflowPredictMAEST::name = essentia::standard::TensorflowPredictMAEST::name; +const char* TensorflowPredictMAEST::category = essentia::standard::TensorflowPredictMAEST::category; +const char* TensorflowPredictMAEST::description = essentia::standard::TensorflowPredictMAEST::description; + + +TensorflowPredictMAEST::TensorflowPredictMAEST() : AlgorithmComposite(), + _frameCutter(0), _tensorflowInputMusiCNN(0), _shift(0), _scale(0), _vectorRealToTensor(0), + _tensorToPool(0), _tensorflowPredict(0), _poolToTensor(0), _tensorToVectorReal(0), _configured(false) { + + declareInput(_signal, 480000, "signal", "the input audio signal sampled at 16 kHz"); + declareOutput(_predictions, 1, "predictions", "the output values from the model node named after `output`"); +} + + +void TensorflowPredictMAEST::createInnerNetwork() { + AlgorithmFactory& factory = AlgorithmFactory::instance(); + + _frameCutter = factory.create("FrameCutter"); + _tensorflowInputMusiCNN = factory.create("TensorflowInputMusiCNN"); + _shift = factory.create("UnaryOperator"); + _scale = factory.create("UnaryOperator"); + _vectorRealToTensor = factory.create("VectorRealToTensor"); + _tensorToPool = factory.create("TensorToPool"); + _tensorflowPredict = factory.create("TensorflowPredict"); + _poolToTensor = factory.create("PoolToTensor"); + _tensorToVectorReal = factory.create("TensorToVectorReal"); + + _shift->output("array").setBufferType(BufferUsage::forMultipleFrames); + _scale->output("array").setBufferType(BufferUsage::forMultipleFrames); + _tensorflowInputMusiCNN->output("bands").setBufferType(BufferUsage::forMultipleFrames); + + _signal >> _frameCutter->input("signal"); + _frameCutter->output("frame") >> _tensorflowInputMusiCNN->input("frame"); + _tensorflowInputMusiCNN->output("bands") >> _shift->input("array"); + 
_shift->output("array") >> _scale->input("array"); + _scale->output("array") >> _vectorRealToTensor->input("frame"); + _vectorRealToTensor->output("tensor") >> _tensorToPool->input("tensor"); + _tensorToPool->output("pool") >> _tensorflowPredict->input("poolIn"); + _tensorflowPredict->output("poolOut") >> _poolToTensor->input("pool"); + _poolToTensor->output("tensor") >> _tensorToVectorReal->input("tensor"); + + + attach(_tensorToVectorReal->output("frame"), _predictions); + + _network = new scheduler::Network(_frameCutter); +} + + +void TensorflowPredictMAEST::clearAlgos() { + if (!_configured) return; + delete _network; +} + + +TensorflowPredictMAEST::~TensorflowPredictMAEST() { + clearAlgos(); +} + + +void TensorflowPredictMAEST::reset() { + AlgorithmComposite::reset(); +} + + +void TensorflowPredictMAEST::configure() { + if (_configured) { + clearAlgos(); + } + + createInnerNetwork(); + + int patchHopSize = parameter("patchHopSize").toInt(); + string lastPatchMode = parameter("lastPatchMode").toString(); + int patchSize = parameter("patchSize").toInt(); + int batchSize = parameter("batchSize").toInt(); + + string input = parameter("input").toString(); + string output = parameter("output").toString(); + string isTrainingName = parameter("isTrainingName").toString(); + + string graphFilename = parameter("graphFilename").toString(); + string savedModel = parameter("savedModel").toString(); + + + // Note the small difference between the patchHopSize and the patchSize parameters below. + // The patchHopSize is set to jump exactly 30, 20, 10, or 5 seconds. 
+ // The patchSize is the closest number suitable considering the kernel and stride sizes of the + // Transformer's embedding layer: + // https://cs231n.github.io/convolutional-networks/#conv + + if (parameter("patchSize").isConfigured()) { + if (graphFilename.find("20s") != std::string::npos) { + E_INFO("TensorFlowPredictMAEST: We detected that the default patchSize is not suitable for the graph `" << graphFilename.c_str() << "`. Setting it to 1258, which is adequate for the 20s model."); + patchSize = 1258; + } else if (graphFilename.find("10s") != std::string::npos) { + E_INFO("TensorFlowPredictMAEST: We detected that the default patchSize is not suitable for the graph `" << graphFilename.c_str() << "`. Setting it to 626, which is adequate for the 10s model."); + patchSize = 626; + } else if (graphFilename.find("5s") != std::string::npos) { + E_INFO("TensorFlowPredictMAEST: We detected that the default patchSize is not suitable for the graph `" << graphFilename.c_str() << "`. Setting it to 316, which is adequate for the 5s model."); + patchSize = 316; + } + } + + if (parameter("patchHopSize").isConfigured()) { + if (graphFilename.find("20s") != std::string::npos) { + E_INFO("TensorFlowPredictMAEST: Setting patchHopSize to 1250, which is adequate for the 20s model.\n"); + patchHopSize = 1250; + } else if (graphFilename.find("10s") != std::string::npos) { + E_INFO("TensorFlowPredictMAEST: Setting patchHopSize to 625, which is adequate for the 10s model.\n"); + patchHopSize = 625; + } else if (graphFilename.find("5s") != std::string::npos) { + E_INFO("TensorFlowPredictMAEST: Setting patchHopSize to 313, which is adequate for the 5s model.\n"); + patchHopSize = 313; + } + } + + + vector inputShape({batchSize, 1, patchSize, _numberBands}); + + _frameCutter->configure("frameSize", _frameSize, "hopSize", _hopSize); + + _vectorRealToTensor->configure("shape", inputShape, + "lastPatchMode", lastPatchMode, + "patchHopSize", patchHopSize); + + _shift->configure("shift", 
-_mean); + _scale->configure("scale", 1.0 / (_std * 2)); + + _configured = true; + + + _tensorToPool->configure("namespace", input); + + _poolToTensor->configure("namespace", output); + + + _tensorflowPredict->configure("graphFilename", graphFilename, + "savedModel", savedModel, + "inputs", vector({input}), + "outputs", vector({output}), + "isTrainingName", isTrainingName); +} + +} // namespace streaming +} // namespace essentia + + + +namespace essentia { +namespace standard { + +const char* TensorflowPredictMAEST::name = "TensorflowPredictMAEST"; +const char* TensorflowPredictMAEST::category = "Machine Learning"; +const char* TensorflowPredictMAEST::description = DOC( + "This algorithm makes predictions using MAEST-based models.\n" + "\n" + "Internally, it uses TensorflowInputMusiCNN for the input feature extraction " + "(mel bands). It feeds the model with mel-spectrogram patches and " + "jumps a constant amount of frames determined by `patchHopSize`.\n" + "\n" + "By setting the `batchSize` parameter to -1 or 0 the patches are stored to run a single " + "TensorFlow session at the end of the stream. This allows to take advantage " + "of parallelization when GPUs are available, but at the same time it can be " + "memory exhausting for long files.\n" + "\n" + "The recommended pipeline is as follows::\n" + "\n" + " MonoLoader(sampleRate=16000, resampleQuality=4) >> TensorflowPredictMAEST\n" + "\n" + "Note: this algorithm does not make any check on the input model so it is " + "the user's responsibility to make sure it is a valid one.\n" + "\n" + "Note: when `patchHopSize` and `patchSize` are not specified, the algorithm " + "will parse `graphFilename` to try to set appropriate values.\n" + "\n" + "References:\n" + "\n" + "1. Alonso-Jiménez, P., Serra, X., & Bogdanov, D. (2023). Efficient Supervised " + "Training of Audio Transformers for Music Representation Learning. 
In Proceedings " + "of the 24th International Society for Music Information Retrieval Conference " + "(ISMIR 2023)\n\n" + "2. Supported models at https://essentia.upf.edu/models.html#MAEST\n\n"); + + +TensorflowPredictMAEST::TensorflowPredictMAEST() { + declareInput(_signal, "signal", "the input audio signal sampled at 16 kHz"); + declareOutput(_predictions, "predictions", "the output values from the model node named after `output`"); + + createInnerNetwork(); + } + + +TensorflowPredictMAEST::~TensorflowPredictMAEST() { + delete _network; +} + + +void TensorflowPredictMAEST::createInnerNetwork() { + _tensorflowPredictMAEST = streaming::AlgorithmFactory::create("TensorflowPredictMAEST"); + _vectorInput = new streaming::VectorInput(); + + *_vectorInput >> _tensorflowPredictMAEST->input("signal"); + _tensorflowPredictMAEST->output("predictions") >> PC(_pool, "predictions"); + + _network = new scheduler::Network(_vectorInput); +} + + +void TensorflowPredictMAEST::configure() { + _tensorflowPredictMAEST->configure(INHERIT("graphFilename"), + INHERIT("savedModel"), + INHERIT("input"), + INHERIT("output"), + INHERIT("isTrainingName"), + INHERIT("patchHopSize"), + INHERIT("lastPatchMode"), + INHERIT("patchSize"), + INHERIT("batchSize")); +} + + +void TensorflowPredictMAEST::compute() { + const vector& signal = _signal.get(); + vector >& predictions = _predictions.get(); + + if (!signal.size()) { + throw EssentiaException("TensorflowPredictMAEST: empty input signal"); + } + + _vectorInput->setVector(&signal); + + _network->run(); + + try { + predictions = _pool.value > >("predictions"); + } + catch (EssentiaException&) { + predictions.clear(); + reset(); + + throw EssentiaException("TensorflowPredictMAEST: input signal is too short."); + } + + reset(); +} + + +void TensorflowPredictMAEST::reset() { + _network->reset(); + _pool.remove("predictions"); +} + +} // namespace standard +} // namespace essentia diff --git a/src/algorithms/machinelearning/tensorflowpredictmaest.h 
b/src/algorithms/machinelearning/tensorflowpredictmaest.h new file mode 100644 index 000000000..dfc88ead6 --- /dev/null +++ b/src/algorithms/machinelearning/tensorflowpredictmaest.h @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2006-2023 Music Technology Group - Universitat Pompeu Fabra + * + * This file is part of Essentia + * + * Essentia is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation (FSF), either version 3 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the Affero GNU General Public License + * version 3 along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#ifndef ESSENTIA_TENSORFLOWPREDICTMAEST_H +#define ESSENTIA_TENSORFLOWPREDICTMAEST_H + + +#include "streamingalgorithmcomposite.h" +#include "algorithmfactory.h" +#include "algorithm.h" +#include "network.h" + +namespace essentia { +namespace streaming { + +class TensorflowPredictMAEST : public AlgorithmComposite { + protected: + Algorithm* _frameCutter; + Algorithm* _tensorflowInputMusiCNN; + Algorithm* _shift; + Algorithm* _scale; + Algorithm* _vectorRealToTensor; + Algorithm* _tensorToPool; + Algorithm* _tensorflowPredict; + Algorithm* _poolToTensor; + Algorithm* _tensorToVectorReal; + + SinkProxy _signal; + SourceProxy > _predictions; + + scheduler::Network* _network; + bool _configured; + + void createInnerNetwork(); + void clearAlgos(); + + // Hardcoded parameters similar to our previous models (EffnetDiscogs/MusiCNN). 
+ int _frameSize = 512; + int _hopSize = 256; + int _numberBands = 96; + double _mean = 2.06755686098554; + double _std = 1.268292820667291; + + public: + TensorflowPredictMAEST(); + ~TensorflowPredictMAEST(); + + void declareParameters() { + declareParameter("graphFilename", "the name of the file from which to load the TensorFlow graph", "", ""); + declareParameter("savedModel", "the name of the TensorFlow SavedModel. Overrides parameter `graphFilename`", "", ""); + declareParameter("input", "the name of the input node in the TensorFlow graph", "", "serving_default_melspectrogram"); + declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "StatefulPartitionedCall"); + declareParameter("isTrainingName", "the name of an additional input node to indicate the model if it is in training mode or not. Leave it empty when the model does not need such input", "", ""); + declareParameter("patchHopSize", "the number of frames between the beginnings of adjacent patches. 0 to avoid overlap", "[0,inf)", 1875); + declareParameter("lastPatchMode", "what to do with the last frames: `repeat` them to fill the last patch or `discard` them", "{discard,repeat}", "discard"); + declareParameter("batchSize", "the batch size for prediction. This allows parallelization when GPUs are available. Set it to -1 or 0 to accumulate all the patches and run a single TensorFlow session at the end of the stream", "[-1,inf)", 1); + declareParameter("patchSize", "number of frames required for each inference. 
This parameter should match the model's expected input shape.", "[0,inf)", 1876); + } + + void declareProcessOrder() { + declareProcessStep(ChainFrom(_frameCutter)); + } + + void configure(); + void reset(); + + static const char* name; + static const char* category; + static const char* description; + +}; + +} // namespace streaming +} // namespace essentia + +#include "vectorinput.h" +#include "pool.h" +#include "poolstorage.h" + +namespace essentia { +namespace standard { + +// Standard non-streaming algorithm comes after the streaming one as it +// depends on it +class TensorflowPredictMAEST : public Algorithm { + protected: + Input > _signal; + Output > > _predictions; + + streaming::Algorithm* _tensorflowPredictMAEST; + streaming::VectorInput* _vectorInput; + scheduler::Network* _network; + Pool _pool; + + void createInnerNetwork(); + + public: + TensorflowPredictMAEST(); + ~TensorflowPredictMAEST(); + + void declareParameters() { + declareParameter("graphFilename", "the name of the file from which to load the TensorFlow graph", "", ""); + declareParameter("savedModel", "the name of the TensorFlow SavedModel. Overrides parameter `graphFilename`", "", ""); + declareParameter("input", "the name of the input nodes in the Tensorflow graph", "", "serving_default_melspectrogram"); + declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "StatefulPartitionedCall"); + declareParameter("isTrainingName", "the name of an additional input node indicating whether the model is to be run in a training mode (for models with a training mode, leave it empty otherwise)", "", ""); + declareParameter("patchHopSize", "number of frames between the beginnings of adjacent patches. 0 to avoid overlap", "[0,inf)", 1876); + declareParameter("lastPatchMode", "what to do with the last frames: `repeat` them to fill the last patch or `discard` them", "{discard,repeat}", "discard"); + declareParameter("batchSize", "the batch size for prediction. 
This allows parallelization when GPUs are available. Set it to -1 or 0 to accumulate all the patches and run a single TensorFlow session at the end of the stream", "[-1,inf)", 1); + declareParameter("patchSize", "number of frames required for each inference. This parameter should match the model's expected input shape.", "[0,inf)", 1876); + } + + void configure(); + void compute(); + void reset(); + + static const char* name; + static const char* category; + static const char* description; +}; + +} // namespace standard +} // namespace essentia + +#endif // ESSENTIA_TENSORFLOWPREDICTMAEST_H diff --git a/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py b/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py new file mode 100644 index 000000000..34a42d207 --- /dev/null +++ b/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +# Copyright (C) 2006-2023 Music Technology Group - Universitat Pompeu Fabra +# +# This file is part of Essentia +# +# Essentia is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation (FSF), either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the Affero GNU General Public License +# version 3 along with this program. If not, see http://www.gnu.org/licenses/ + + +from essentia_test import * + + +class TestTensorFlowPredictMAEST(TestCase): + @classmethod + def setUpClass(self): + # Since loading the Transformers takes a lot of time, we do it only once as reusable class members. 
+ # When using these algos, Essentia complains that other networks (e.g., MonoLoader's network) were + destroyed in the meantime. These warnings are not relevant for the tests, so we disable + warnings temporarily. + + essentia.log.warningActive = False + + self.graphFilename30s = join( + testdata.models_dir, "maest", "discogs-maest-30s-pw-1.pb" + ) + self.graphFilename10s = join( + testdata.models_dir, "maest", "discogs-maest-10s-pw-1.pb" + ) + + self.model30s = TensorflowPredictMAEST(graphFilename=self.graphFilename30s) + self.model10s = TensorflowPredictMAEST(graphFilename=self.graphFilename10s) + + @classmethod + def tearDownClass(self): + essentia.log.warningActive = True + + def testRegressionFrozenModel(self): + expected = numpy.load( + join( + filedir(), + "tensorflowpredictmaest", + "preds_maest_discogs-maest-30s-pw-1.npy", + ) + ) + + filename = join(testdata.audio_dir, "recorded", "techno_loop.wav") + audio = MonoLoader(filename=filename, sampleRate=16000, resampleQuality=4)() + + activations = self.model30s(audio) + found = numpy.mean(activations, axis=0) + + self.assertAlmostEqualVector(found, expected, 1e-1) + + def testInvalidParam(self): + self.assertConfigureFails( + TensorflowPredictMAEST(), + { + "graphFilename": self.graphFilename30s, + "batchSize": -2, + }, + ) # Cannot be < -1. + self.assertConfigureFails( + TensorflowPredictMAEST(), + { + "graphFilename": self.graphFilename30s, + "patchSize": 0, + }, + ) # Cannot be 0. + + def testEmptyAudio(self): + self.assertComputeFails(self.model30s, []) + + def testShortAudio(self): + self.assertComputeFails( + self.model30s, [1.0] * 16000 * 10 + ) # This model expects more than 30s of audio, 10s should fail. + + def testAutomaticPatchSizeConfig(self): + found_patches = self.model10s([1.0] * 16000 * 20).shape[0] + + # 20s of audio, 10s of patch size -> 2 patches. 
+ self.assertEqual(found_patches, 2) + + +suite = allTests(TestTensorFlowPredictMAEST) + +if __name__ == "__main__": + TextTestRunner(verbosity=2).run(suite) From 13e965beafa5f3e26ea23699f6b4c4d49b60f51e Mon Sep 17 00:00:00 2001 From: palonso Date: Thu, 19 Oct 2023 21:09:41 +0200 Subject: [PATCH 09/26] Add MAEST doc --- doc/sphinxdoc/models.rst | 103 +++++++++++++++++- .../python/models/generate_example_scripts.py | 15 ++- src/examples/python/models/models.yaml | 16 ++- .../discogs-maest-10s-dw-1_embeddings.py | 5 + .../discogs-maest-10s-fs-1_embeddings.py | 5 + .../discogs-maest-10s-pw-1_embeddings.py | 5 + .../discogs-maest-20s-pw-1_embeddings.py | 5 + .../discogs-maest-30s-pw-1_embeddings.py | 5 + .../discogs-maest-30s-pw-ts-1_embeddings.py | 5 + .../maest/discogs-maest-5s-pw-1_embeddings.py | 5 + 10 files changed, 158 insertions(+), 11 deletions(-) create mode 100644 src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-1_embeddings.py create mode 100644 src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-1_embeddings.py create mode 100644 src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-1_embeddings.py create mode 100644 src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-1_embeddings.py create mode 100644 src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-1_embeddings.py create mode 100644 src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-1_embeddings.py create mode 100644 src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-1_embeddings.py diff --git a/doc/sphinxdoc/models.rst b/doc/sphinxdoc/models.rst index 457bfbf0b..f881aaec5 100644 --- a/doc/sphinxdoc/models.rst +++ b/doc/sphinxdoc/models.rst @@ -25,7 +25,7 @@ If you use any of the models in your research, please cite the following paper:: booktitle={International Conference on Acoustics, Speech and 
Signal Processing ({ICASSP})}, year={2020} } - + .. highlight:: default @@ -137,6 +137,102 @@ Models: *Note: We provide models operating with a fixed batch size of 64 samples since it was not possible to port the version with dynamic batch size from ONNX to TensorFlow. Additionally, an ONNX version of the model with* `dynamic batch `_ *size is provided.* +Discogs-MAEST +^^^^^^^^^^^^^ + +Music Audio Efficient Spectrogram Transformer (`MAEST `_) trained to predict music style labels using an in-house dataset annotated with Discogs metadata. + +Models: + + .. collapse:: ⬇️ discogs-maest-30s-pw + + | + + [`weights `_, `metadata `_] + + Model trained with a multi-label classification objective targeting 400 Discogs styles. + + Python code for embedding extraction: + + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-1_embeddings.py + + .. collapse:: ⬇️ discogs-maest-30s-pw-ts + + | + + [`weights `_, `metadata `_] + + Model trained with a multi-label classification objective targeting 400 Discogs styles. + + Python code for embedding extraction: + + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-1_embeddings.py + + .. collapse:: ⬇️ discogs-maest-20s-pw + + | + + [`weights `_, `metadata `_] + + Model trained with a multi-label classification objective targeting 400 Discogs styles. + + Python code for embedding extraction: + + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-1_embeddings.py + + .. collapse:: ⬇️ discogs-maest-10s-pw + + | + + [`weights `_, `metadata `_] + + Model trained with a multi-label classification objective targeting 400 Discogs styles. + + Python code for embedding extraction: + + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-1_embeddings.py + + .. 
collapse:: ⬇️ discogs-maest-10s-fs + + | + + [`weights `_, `metadata `_] + + Model trained with a multi-label classification objective targeting 400 Discogs styles. + + Python code for embedding extraction: + + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-1_embeddings.py + + .. collapse:: ⬇️ discogs-maest-10s-dw + + | + + [`weights `_, `metadata `_] + + Model trained with a multi-label classification objective targeting 400 Discogs styles. + + Python code for embedding extraction: + + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-1_embeddings.py + + .. collapse:: ⬇️ discogs-maest-5s-pw + + | + + [`weights `_, `metadata `_] + + Model trained with a multi-label classification objective targeting 400 Discogs styles. + + Python code for embedding extraction: + + .. literalinclude:: ../../src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-1_embeddings.py + + +*Note: It is possible to retrieve the output of each attention layer by setting* ``output=StatefulPartitionedCall:n`` *, where* ``n`` *is the index of the layer (starting from 1).* +*The output from the attention layers should be interpreted as* ``[batch_index, 1, token_number, embeddings_size]`` +*, where the first and second tokens (i.e.,* ``[0, 0, :2, :]`` *) correspond to the* ``CLS`` *and* ``DIST`` *tokens respectively, and the following ones to input signal (refer to the* `paper `_ *for details).* + OpenL3 ^^^^^^ @@ -240,7 +336,7 @@ The name of these models is a combination of the classification/regression task *Note: TensorflowPredict2D has to be configured with the correct output layer name for each classifier. Check the attached JSON file to find the name of the output layer on each case.* -Music genre and style +Music genre and style ^^^^^^^^^^^^^^^^^^^^^ @@ -2071,6 +2167,3 @@ Models: Python code for predictions: .. 
literalinclude :: ../../src/examples/python/models/scripts/tempo/tempocnn/deeptemp-k16-3_predictions.py - - - diff --git a/src/examples/python/models/generate_example_scripts.py b/src/examples/python/models/generate_example_scripts.py index 6fd02a008..ea00e846e 100644 --- a/src/examples/python/models/generate_example_scripts.py +++ b/src/examples/python/models/generate_example_scripts.py @@ -14,6 +14,7 @@ "TensorflowPredict2D": "model/Placeholder", "TensorflowPredictEffnetDiscogs": "serving_default_melspectrogram", "TensorflowPredictFSDSINet": "x", + "TensorflowPredictMAEST": "serving_default_melspectrogram", "PitchCREPE": "frames", "TempoCNN": "input", } @@ -24,6 +25,7 @@ "TensorflowPredict2D": "model/Sigmoid", "TensorflowPredictEffnetDiscogs": "PartitionedCall:0", "TensorflowPredictFSDSINet": "model/predictions/Sigmoid", + "TensorflowPredictMAEST": "PartitionedCall:0", "PitchCREPE": "model/classifier/Sigmoid", "TempoCNN": "output", } @@ -97,13 +99,14 @@ def get_additional_parameters(metadata: dict, output: str, algo_name: str): model_output["output_purpose"] == output and model_output["name"] != OUTPUT_DEFAULTS[algo_name] ): - additional_parameters += f', output="{model_output["name"]}"' + if metadata["name"] == "MAEST" and ":7" in model_output["name"]: + # In maest models we recommend the embeddings from the 7th layer. 
+ additional_parameters += f', output="{model_output["name"]}"' + return additional_parameters -def get_metadata( - task_type: str, family_name: str, model: str, metadata_base_dir=False -): +def get_metadata(task_type: str, family_name: str, model: str, metadata_base_dir=False): if metadata_base_dir: metadata_path = str( Path(metadata_base_dir, task_type, family_name, f"{model}.json") @@ -155,7 +158,9 @@ def process_model( graph_filename_tgt = script_dir / graph_filename if download_models and (not graph_filename_tgt.exists()): - assert not models_base_dir, "downloading the models is incompatible with specifying `models_base_dir`" + assert ( + not models_base_dir + ), "downloading the models is incompatible with specifying `models_base_dir`" try: script_dir.mkdir(parents=True, exist_ok=True) urlretrieve(metadata["link"], graph_filename_tgt) diff --git a/src/examples/python/models/models.yaml b/src/examples/python/models/models.yaml index fda6aa0f3..ae866ea13 100644 --- a/src/examples/python/models/models.yaml +++ b/src/examples/python/models/models.yaml @@ -7,7 +7,7 @@ audio-event-recognition: - embeddings models: - audioset-yamnet-1 - + fsd-sinet: algo_name: TensorflowPredictFSDSINet sample_rate: 22050 @@ -65,6 +65,20 @@ feature-extractors: - discogs_release_embeddings-effnet-bs64-1 - discogs_track_embeddings-effnet-bs64-1 + maest: + algo_name: TensorflowPredictMAEST + sample_rate: 16000 + outputs: + - embeddings + models: + - discogs-maest-10s-dw-1 + - discogs-maest-10s-fs-1 + - discogs-maest-10s-pw-1 + - discogs-maest-20s-pw-1 + - discogs-maest-30s-pw-1 + - discogs-maest-30s-pw-ts-1 + - discogs-maest-5s-pw-1 + pitch: crepe: algo_name: PitchCREPE diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-1_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-1_embeddings.py new file mode 100644 index 000000000..24c2c636c --- /dev/null +++ 
b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-dw-1_embeddings.py @@ -0,0 +1,5 @@ +from essentia.standard import MonoLoader, TensorflowPredictMAEST + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-dw-1.pb", output="StatefulPartitionedCall:7") +embeddings = model(audio) diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-1_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-1_embeddings.py new file mode 100644 index 000000000..59c1c891d --- /dev/null +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-fs-1_embeddings.py @@ -0,0 +1,5 @@ +from essentia.standard import MonoLoader, TensorflowPredictMAEST + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-fs-1.pb", output="StatefulPartitionedCall:7") +embeddings = model(audio) diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-1_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-1_embeddings.py new file mode 100644 index 000000000..aabe99d14 --- /dev/null +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-10s-pw-1_embeddings.py @@ -0,0 +1,5 @@ +from essentia.standard import MonoLoader, TensorflowPredictMAEST + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-pw-1.pb", output="StatefulPartitionedCall:7") +embeddings = model(audio) diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-1_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-1_embeddings.py new file mode 100644 index 
000000000..3cd8d93b0 --- /dev/null +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-20s-pw-1_embeddings.py @@ -0,0 +1,5 @@ +from essentia.standard import MonoLoader, TensorflowPredictMAEST + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +model = TensorflowPredictMAEST(graphFilename="discogs-maest-20s-pw-1.pb", output="StatefulPartitionedCall:7") +embeddings = model(audio) diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-1_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-1_embeddings.py new file mode 100644 index 000000000..8e26a43ef --- /dev/null +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-1_embeddings.py @@ -0,0 +1,5 @@ +from essentia.standard import MonoLoader, TensorflowPredictMAEST + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-1.pb", output="StatefulPartitionedCall:7") +embeddings = model(audio) diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-1_embeddings.py b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-1_embeddings.py new file mode 100644 index 000000000..2dfbaa9f6 --- /dev/null +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-30s-pw-ts-1_embeddings.py @@ -0,0 +1,5 @@ +from essentia.standard import MonoLoader, TensorflowPredictMAEST + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-ts-1.pb", output="StatefulPartitionedCall:7") +embeddings = model(audio) diff --git a/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-1_embeddings.py 
b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-1_embeddings.py new file mode 100644 index 000000000..3c747bd63 --- /dev/null +++ b/src/examples/python/models/scripts/feature-extractors/maest/discogs-maest-5s-pw-1_embeddings.py @@ -0,0 +1,5 @@ +from essentia.standard import MonoLoader, TensorflowPredictMAEST + +audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)() +model = TensorflowPredictMAEST(graphFilename="discogs-maest-5s-pw-1.pb", output="StatefulPartitionedCall:7") +embeddings = model(audio) From 6d362d0833e311bd238b834d5cdc82830e0bd1f2 Mon Sep 17 00:00:00 2001 From: palonso Date: Thu, 19 Oct 2023 21:11:28 +0200 Subject: [PATCH 10/26] Modify MAEST to return tensors instead of vectors In other TensorflowPredict algorithms we have preferred to return 2D outputs since they typically fit in the schema (timestamps, embeddings), or (timestamps, activations). However, in some cases more dimensions are required. For example, we need 3D to return the attention layers (batch, tokens, dimensions). 
A similar problem has happened before when trying to retrieve the internal representations of VGGish: https://github.com/MTG/essentia/issues/1333 --- .../tensorflowpredictmaest.cpp | 35 +++++++++++++------ .../machinelearning/tensorflowpredictmaest.h | 7 ++-- .../test_tensorflowpredictmaest.py | 4 +-- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/src/algorithms/machinelearning/tensorflowpredictmaest.cpp b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp index 2d30cc970..09df1d708 100644 --- a/src/algorithms/machinelearning/tensorflowpredictmaest.cpp +++ b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp @@ -31,7 +31,7 @@ const char* TensorflowPredictMAEST::description = essentia::standard::Tensorflow TensorflowPredictMAEST::TensorflowPredictMAEST() : AlgorithmComposite(), _frameCutter(0), _tensorflowInputMusiCNN(0), _shift(0), _scale(0), _vectorRealToTensor(0), - _tensorToPool(0), _tensorflowPredict(0), _poolToTensor(0), _tensorToVectorReal(0), _configured(false) { + _tensorToPool(0), _tensorflowPredict(0), _poolToTensor(0), _configured(false) { declareInput(_signal, 480000, "signal", "the input audio signal sampled at 16 kHz"); declareOutput(_predictions, 1, "predictions", "the output values from the model node named after `output`"); @@ -49,7 +49,6 @@ void TensorflowPredictMAEST::createInnerNetwork() { _tensorToPool = factory.create("TensorToPool"); _tensorflowPredict = factory.create("TensorflowPredict"); _poolToTensor = factory.create("PoolToTensor"); - _tensorToVectorReal = factory.create("TensorToVectorReal"); _shift->output("array").setBufferType(BufferUsage::forMultipleFrames); _scale->output("array").setBufferType(BufferUsage::forMultipleFrames); @@ -63,10 +62,8 @@ void TensorflowPredictMAEST::createInnerNetwork() { _vectorRealToTensor->output("tensor") >> _tensorToPool->input("tensor"); _tensorToPool->output("pool") >> _tensorflowPredict->input("poolIn"); _tensorflowPredict->output("poolOut") >> 
_poolToTensor->input("pool"); - _poolToTensor->output("tensor") >> _tensorToVectorReal->input("tensor"); - - attach(_tensorToVectorReal->output("frame"), _predictions); + attach(_poolToTensor->output("tensor"), _predictions); _network = new scheduler::Network(_frameCutter); } @@ -180,14 +177,25 @@ const char* TensorflowPredictMAEST::category = "Machine Learning"; const char* TensorflowPredictMAEST::description = DOC( "This algorithm makes predictions using MAEST-based models.\n" "\n" - "Internally, it uses TensorflowInputMusiCNN for the input feature extraction " - "(mel bands). It feeds the model with mel-spectrogram patches and " - "jumps a constant amount of frames determined by `patchHopSize`.\n" + "Internally, it uses TensorflowInputMusiCNN for the input feature extraction. " + "It feeds the model with mel-spectrogram patches and jumps a constant amount " + "of frames determined by `patchHopSize`.\n" "\n" "By setting the `batchSize` parameter to -1 or 0 the patches are stored to run a single " "TensorFlow session at the end of the stream. This allows to take advantage " "of parallelization when GPUs are available, but at the same time it can be " "memory exhausting for long files.\n" + "\n" + "For the official MAEST models, the algorithm outputs the probabilities for " + "400 music style labels by default. 
Additionally, it is possible to retrieve " + "the output of each attention layer by setting `output=StatefulParitionedCall:n`, " + "where `n` is the index of the layer (starting from 1).\n" + "The output from the attention layers should be interpreted as follows:\n" + " [batch_index, 1, token_number, embeddings_size]\n" + "Where the the fist and second tokens (e.g., [0, 0, :2, :]) correspond to the " + "CLS and DIST tokens respectively, and the following ones to input signal ( " + "refer to the original paper for details [1]).\n" + "\n" "The recommended pipeline is as follows::\n" "\n" @@ -247,7 +255,7 @@ void TensorflowPredictMAEST::configure() { void TensorflowPredictMAEST::compute() { const vector& signal = _signal.get(); - vector >& predictions = _predictions.get(); + Tensor& predictions = _predictions.get(); if (!signal.size()) { throw EssentiaException("TensorflowPredictMAEST: empty input signal"); @@ -258,10 +266,15 @@ void TensorflowPredictMAEST::compute() { _network->run(); try { - predictions = _pool.value > >("predictions"); + vector > predictions_vector = _pool.value > >("predictions"); + predictions = predictions_vector[0]; + + for (int i = 1; i < (int)predictions_vector.size(); i++) { + Tensor new_predictions = predictions.concatenate(predictions_vector[i], 0).eval(); + predictions = new_predictions; + } } catch (EssentiaException&) { - predictions.clear(); reset(); throw EssentiaException("TensorflowPredictMAEST: input signal is too short."); diff --git a/src/algorithms/machinelearning/tensorflowpredictmaest.h b/src/algorithms/machinelearning/tensorflowpredictmaest.h index dfc88ead6..c366b8650 100644 --- a/src/algorithms/machinelearning/tensorflowpredictmaest.h +++ b/src/algorithms/machinelearning/tensorflowpredictmaest.h @@ -39,10 +39,9 @@ class TensorflowPredictMAEST : public AlgorithmComposite { Algorithm* _tensorToPool; Algorithm* _tensorflowPredict; Algorithm* _poolToTensor; - Algorithm* _tensorToVectorReal; SinkProxy _signal; - SourceProxy > 
_predictions; + SourceProxy > _predictions; scheduler::Network* _network; bool _configured; @@ -101,7 +100,7 @@ namespace standard { class TensorflowPredictMAEST : public Algorithm { protected: Input > _signal; - Output > > _predictions; + Output > _predictions; streaming::Algorithm* _tensorflowPredictMAEST; streaming::VectorInput* _vectorInput; @@ -120,7 +119,7 @@ class TensorflowPredictMAEST : public Algorithm { declareParameter("input", "the name of the input nodes in the Tensorflow graph", "", "serving_default_melspectrogram"); declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "StatefulPartitionedCall"); declareParameter("isTrainingName", "the name of an additional input node indicating whether the model is to be run in a training mode (for models with a training mode, leave it empty otherwise)", "", ""); - declareParameter("patchHopSize", "number of frames between the beginnings of adjacent patches. 0 to avoid overlap", "[0,inf)", 1876); + declareParameter("patchHopSize", "number of frames between the beginnings of adjacent patches. 0 to avoid overlap", "[0,inf)", 1875); declareParameter("lastPatchMode", "what to do with the last frames: `repeat` them to fill the last patch or `discard` them", "{discard,repeat}", "discard"); declareParameter("batchSize", "the batch size for prediction. This allows parallelization when GPUs are available. Set it to -1 or 0 to accumulate all the patches and run a single TensorFlow session at the end of the stream", "[-1,inf)", 1); declareParameter("patchSize", "number of frames required for each inference. 
This parameter should match the model's expected input shape.", "[0,inf)", 1876); diff --git a/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py b/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py index 34a42d207..a50c810ef 100644 --- a/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py +++ b/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py @@ -45,7 +45,7 @@ def setUpClass(self): def tearDownClass(self): essentia.log.warningActive = True - def testRegressionFrozenModel(self): + def testRegression(self): expected = numpy.load( join( filedir(), @@ -58,7 +58,7 @@ def testRegressionFrozenModel(self): audio = MonoLoader(filename=filename, sampleRate=16000, resampleQuality=4)() activations = self.model30s(audio) - found = numpy.mean(activations, axis=0) + found = numpy.mean(activations, axis=0).squeeze() self.assertAlmostEqualVector(found, expected, 1e-1) From 7f7b81192deba666d40941e8fd1a823a0e1dff17 Mon Sep 17 00:00:00 2001 From: palonso Date: Thu, 19 Oct 2023 21:37:32 +0200 Subject: [PATCH 11/26] Fix output index detection bug in TensorflowPredict By developing MAEST we found that our strategy to separate output name and index only supported outputs up to 1 digit. The parsing code has been improved and simplified. As a result we have removed one test case that was no longer needed: e.g., output:3a. Now atoi will process (3a) and correctly retrieve (3) so that: name: output index: 3 Additionally we could enforce that no trailing characters after the number were allowed. Finally, we have implemented subtests so that it is easier to know which assertion failed. 
--- .../machinelearning/tensorflowpredict.cpp | 17 +- .../machinelearning/test_tensorflowpredict.py | 276 ++++++++++-------- 2 files changed, 160 insertions(+), 133 deletions(-) diff --git a/src/algorithms/machinelearning/tensorflowpredict.cpp b/src/algorithms/machinelearning/tensorflowpredict.cpp index e6867f6d5..5b8a02ebc 100644 --- a/src/algorithms/machinelearning/tensorflowpredict.cpp +++ b/src/algorithms/machinelearning/tensorflowpredict.cpp @@ -366,6 +366,7 @@ const Tensor TensorflowPredict::TFToTensor( TF_Output TensorflowPredict::graphOperationByName(const string nodeName) { int index = 0; const char* name = nodeName.c_str(); + string newNodeName; // TensorFlow operations (or nodes from the graph perspective) return tensors named , where n goes // from 0 to the number of outputs. The first output tensor of a node can be extracted implicitly (nodeName) @@ -374,22 +375,16 @@ TF_Output TensorflowPredict::graphOperationByName(const string nodeName) { string::size_type n = nodeName.find(':'); if (n != string::npos) { try { - string::size_type next_char; - index = stoi(nodeName.substr(n + 1), &next_char); - - if (n + next_char + 1 != nodeName.size()) { - throw EssentiaException("TensorflowPredict: `" + nodeName + "` is not a valid node name, the index cannot " - "be followed by other characters. Make sure that all your inputs and outputs follow " - "the pattern `nodeName:n`, where `n` in an integer that goes from 0 to the number " - "of outputs of the node - 1."); - } + newNodeName = nodeName.substr(0, n); + name = newNodeName.c_str(); + index = stoi(nodeName.substr(n + 1, nodeName.size())); } catch (const invalid_argument& ) { throw EssentiaException("TensorflowPredict: `" + nodeName + "` is not a valid node name. 
Make sure that all " "your inputs and outputs follow the pattern `nodeName:n`, where `n` in an integer that " "goes from 0 to the number of outputs of the node - 1."); - } - name = nodeName.substr(0, n).c_str(); + } + } TF_Operation* oper = TF_GraphOperationByName(_graph, name); diff --git a/test/src/unittests/machinelearning/test_tensorflowpredict.py b/test/src/unittests/machinelearning/test_tensorflowpredict.py index da2b6bbab..bd709bd0e 100644 --- a/test/src/unittests/machinelearning/test_tensorflowpredict.py +++ b/test/src/unittests/machinelearning/test_tensorflowpredict.py @@ -18,14 +18,12 @@ # version 3 along with this program. If not, see http://www.gnu.org/licenses/ - from essentia_test import * import sys import os class TestTensorFlowPredict(TestCase): - def regression(self, parameters): # Test a simple tensorflow model trained on Essentia features. # The ground true values were obtained with the following script: @@ -36,14 +34,14 @@ def regression(self, parameters): frameSize = 1024 hopSize = frameSize - filename = join(testdata.audio_dir, 'recorded', 'cat_purrrr.wav') + filename = join(testdata.audio_dir, "recorded", "cat_purrrr.wav") audio = MonoLoader(filename=filename)() - w = Windowing(type='hann', zeroPadding=frameSize) + w = Windowing(type="hann", zeroPadding=frameSize) spectrum = Spectrum() - mels = MelBands(numberBands=numberBands, type='magnitude') - logNorm = UnaryOperator(type='log') + mels = MelBands(numberBands=numberBands, type="magnitude") + logNorm = UnaryOperator(type="log") bands = [] for frame in FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize): @@ -52,37 +50,37 @@ def regression(self, parameters): bands = array(bands) discard = bands.shape[0] % patchSize - bands = numpy.reshape(bands[:-discard,:], [-1, patchSize, numberBands]) + bands = numpy.reshape(bands[:-discard, :], [-1, patchSize, numberBands]) batch = numpy.expand_dims(bands, 1) pool = Pool() - pool.set('model/Placeholder', batch) + pool.set("model/Placeholder", 
batch) tfp = TensorflowPredict(**parameters) poolOut = tfp(pool) - foundValues = poolOut['model/Softmax'].mean(axis=0).squeeze() + foundValues = poolOut["model/Softmax"].mean(axis=0).squeeze() self.assertAlmostEqualVector(foundValues, expectedValues, 1e-5) def testRegressionFrozenModel(self): parameters = { - 'graphFilename': join(testdata.models_dir, 'vgg', 'vgg4.pb'), - 'inputs': ['model/Placeholder'], - 'outputs': ['model/Softmax'], - 'isTraining': False, - 'isTrainingName': 'model/Placeholder_1', + "graphFilename": join(testdata.models_dir, "vgg", "vgg4.pb"), + "inputs": ["model/Placeholder"], + "outputs": ["model/Softmax"], + "isTraining": False, + "isTrainingName": "model/Placeholder_1", } self.regression(parameters) def testRegressionSavedModel(self): parameters = { - 'savedModel': join(testdata.models_dir, 'vgg', 'vgg4'), - 'inputs': ['model/Placeholder'], - 'outputs': ['model/Softmax'], - 'isTraining': False, - 'isTrainingName': 'model/Placeholder_1', + "savedModel": join(testdata.models_dir, "vgg", "vgg4"), + "inputs": ["model/Placeholder"], + "outputs": ["model/Softmax"], + "isTraining": False, + "isTrainingName": "model/Placeholder_1", } self.regression(parameters) @@ -91,12 +89,12 @@ def testSavedModelOverridesGraphFilename(self): # When both are specified, `savedModel` should be preferred. # Test this by setting an invalid `graphFilename` that should be ignored. 
parameters = { - 'graphFilename': "wrong_model", - 'savedModel': join(testdata.models_dir, 'vgg', 'vgg4'), - 'inputs': ['model/Placeholder'], - 'outputs': ['model/Softmax'], - 'isTraining': False, - 'isTrainingName': 'model/Placeholder_1', + "graphFilename": "wrong_model", + "savedModel": join(testdata.models_dir, "vgg", "vgg4"), + "inputs": ["model/Placeholder"], + "outputs": ["model/Softmax"], + "isTraining": False, + "isTrainingName": "model/Placeholder_1", } self.regression(parameters) @@ -104,160 +102,194 @@ def testSavedModelOverridesGraphFilename(self): def testEmptyModelName(self): # With empty model name the algorithm should skip the configuration without errors. self.assertConfigureSuccess(TensorflowPredict(), {}) - self.assertConfigureSuccess(TensorflowPredict(), {'graphFilename': ''}) - self.assertConfigureSuccess(TensorflowPredict(), {'graphFilename': '', - 'inputs': [''] - }) - self.assertConfigureSuccess(TensorflowPredict(), {'graphFilename': '', - 'inputs': ['wrong_input'] - }) - self.assertConfigureSuccess(TensorflowPredict(), {'savedModel': ''}) - self.assertConfigureSuccess(TensorflowPredict(), {'savedModel': '', - 'inputs':[''] - }) - self.assertConfigureSuccess(TensorflowPredict(), {'savedModel': '', - 'inputs':['wrong_input'] - }) - self.assertConfigureSuccess(TensorflowPredict(), {'graphFilename': '', - 'savedModel':'' - }) - self.assertConfigureSuccess(TensorflowPredict(), {'graphFilename': '', - 'savedModel':'', - 'inputs': [''] - }) - self.assertConfigureSuccess(TensorflowPredict(), {'graphFilename': '', - 'savedModel':'', - 'inputs': ['wrong_input'] - }) + self.assertConfigureSuccess(TensorflowPredict(), {"graphFilename": ""}) + self.assertConfigureSuccess( + TensorflowPredict(), {"graphFilename": "", "inputs": [""]} + ) + self.assertConfigureSuccess( + TensorflowPredict(), {"graphFilename": "", "inputs": ["wrong_input"]} + ) + self.assertConfigureSuccess(TensorflowPredict(), {"savedModel": ""}) + self.assertConfigureSuccess( + 
TensorflowPredict(), {"savedModel": "", "inputs": [""]} + ) + self.assertConfigureSuccess( + TensorflowPredict(), {"savedModel": "", "inputs": ["wrong_input"]} + ) + self.assertConfigureSuccess( + TensorflowPredict(), {"graphFilename": "", "savedModel": ""} + ) + self.assertConfigureSuccess( + TensorflowPredict(), {"graphFilename": "", "savedModel": "", "inputs": [""]} + ) + self.assertConfigureSuccess( + TensorflowPredict(), + {"graphFilename": "", "savedModel": "", "inputs": ["wrong_input"]}, + ) def testInvalidParam(self): - model = join(testdata.models_dir, 'vgg', 'vgg4.pb') - self.assertConfigureFails(TensorflowPredict(), {'graphFilename': model}) # inputs and outputs are not defined - self.assertConfigureFails(TensorflowPredict(), {'graphFilename': model, - 'inputs': ['model/Placeholder'], - }) # outputs are not defined - self.assertConfigureFails(TensorflowPredict(), {'graphFilename': model, - 'inputs': ['wrong_input_name'], - 'outputs': ['model/Softmax'], - }) # input does not exist in the model - self.assertConfigureFails(TensorflowPredict(), {'graphFilename': 'wrong_model_name', - 'inputs': ['model/Placeholder'], - 'outputs': ['model/Softmax'], - }) # the model does not exist - + model = join(testdata.models_dir, "vgg", "vgg4.pb") + self.assertConfigureFails( + TensorflowPredict(), {"graphFilename": model} + ) # inputs and outputs are not defined + self.assertConfigureFails( + TensorflowPredict(), + { + "graphFilename": model, + "inputs": ["model/Placeholder"], + }, + ) # outputs are not defined + self.assertConfigureFails( + TensorflowPredict(), + { + "graphFilename": model, + "inputs": ["wrong_input_name"], + "outputs": ["model/Softmax"], + }, + ) # input does not exist in the model + self.assertConfigureFails( + TensorflowPredict(), + { + "graphFilename": "wrong_model_name", + "inputs": ["model/Placeholder"], + "outputs": ["model/Softmax"], + }, + ) # the model does not exist + # Repeat tests for savedModel format. 
- model = join(testdata.models_dir, 'vgg', 'vgg4/') - self.assertConfigureFails(TensorflowPredict(), {'savedModel': model}) # inputs and outputs are not defined - self.assertConfigureFails(TensorflowPredict(), {'savedModel': model, - 'inputs': ['model/Placeholder'], - }) # outputs are not defined - self.assertConfigureFails(TensorflowPredict(), {'savedModel': model, - 'inputs': ['wrong_input_name'], - 'outputs': ['model/Softmax'], - }) # input does not exist in the model - self.assertConfigureFails(TensorflowPredict(), {'savedModel': 'wrong_model_name', - 'inputs': ['model/Placeholder'], - 'outputs': ['model/Softmax'], - }) # the model does not exist + model = join(testdata.models_dir, "vgg", "vgg4/") + self.assertConfigureFails( + TensorflowPredict(), {"savedModel": model} + ) # inputs and outputs are not defined + self.assertConfigureFails( + TensorflowPredict(), + { + "savedModel": model, + "inputs": ["model/Placeholder"], + }, + ) # outputs are not defined + self.assertConfigureFails( + TensorflowPredict(), + { + "savedModel": model, + "inputs": ["wrong_input_name"], + "outputs": ["model/Softmax"], + }, + ) # input does not exist in the model + self.assertConfigureFails( + TensorflowPredict(), + { + "savedModel": "wrong_model_name", + "inputs": ["model/Placeholder"], + "outputs": ["model/Softmax"], + }, + ) # the model does not exist def testIdentityModel(self): # Perform the identity operation in Tensorflow to test if the data is # being copied correctly backwards and fordwards. 
- model = join(filedir(), 'tensorflowpredict', 'identity.pb') - filename = join(testdata.audio_dir, 'recorded', 'cat_purrrr.wav') + model = join(filedir(), "tensorflowpredict", "identity.pb") + filename = join(testdata.audio_dir, "recorded", "cat_purrrr.wav") audio = MonoLoader(filename=filename)() frames = array([frame for frame in FrameGenerator(audio)]) batch = frames[numpy.newaxis, numpy.newaxis, :] pool = Pool() - pool.set('model/Placeholder', batch) + pool.set("model/Placeholder", batch) - poolOut = TensorflowPredict(graphFilename=model, - inputs=['model/Placeholder'], - outputs=['model/Identity'])(pool) + poolOut = TensorflowPredict( + graphFilename=model, + inputs=["model/Placeholder"], + outputs=["model/Identity"], + )(pool) - foundValues = poolOut['model/Identity'] + foundValues = poolOut["model/Identity"] self.assertAlmostEqualMatrix(foundValues, batch) def testComputeWithoutConfiguration(self): pool = Pool() - pool.set('model/Placeholder', numpy.zeros((1, 1, 1, 1), dtype='float32')) + pool.set("model/Placeholder", numpy.zeros((1, 1, 1, 1), dtype="float32")) self.assertComputeFails(TensorflowPredict(), pool) def testIgnoreInvalidReconfiguration(self): pool = Pool() - pool.set('model/Placeholder', numpy.ones((1, 1, 1, 1), dtype='float32')) + pool.set("model/Placeholder", numpy.ones((1, 1, 1, 1), dtype="float32")) - model_name = join(filedir(), 'tensorflowpredict', 'identity.pb') + model_name = join(filedir(), "tensorflowpredict", "identity.pb") model = TensorflowPredict( graphFilename=model_name, - inputs=['model/Placeholder'], - outputs=['model/Identity'], + inputs=["model/Placeholder"], + outputs=["model/Identity"], squeeze=False, ) - firstResult = model(pool)['model/Identity'] + firstResult = model(pool)["model/Identity"] # This attempt to reconfigure the algorithm should be ignored and trigger a Warning. 
model.configure() - secondResult = model(pool)['model/Identity'] + secondResult = model(pool)["model/Identity"] self.assertEqualMatrix(firstResult, secondResult) def testImplicitOutputTensorIndex(self): - model = join(filedir(), 'tensorflowpredict', 'identity.pb') - batch = numpy.reshape(numpy.arange(4, dtype='float32'), (1, 1, 2, 2)) + model = join(filedir(), "tensorflowpredict", "identity.pb") + batch = numpy.reshape(numpy.arange(4, dtype="float32"), (1, 1, 2, 2)) pool = Pool() - pool.set('model/Placeholder', batch) + pool.set("model/Placeholder", batch) - implicit_output = 'model/Identity' + implicit_output = "model/Identity" implicit = TensorflowPredict( graphFilename=model, - inputs=['model/Placeholder'], + inputs=["model/Placeholder"], outputs=[implicit_output], )(pool)[implicit_output].squeeze() - explicit_output = 'model/Identity:0' + explicit_output = "model/Identity:0" explicit = TensorflowPredict( graphFilename=model, - inputs=['model/Placeholder'], + inputs=["model/Placeholder"], outputs=[explicit_output], )(pool)[explicit_output].squeeze() self.assertAlmostEqualMatrix(implicit, explicit) def testNodeNameParser(self): - model = join(testdata.models_dir, 'vgg', 'vgg4.pb') - - self.assertConfigureFails(TensorflowPredict(), {'graphFilename': model, - 'inputs': ['model/Placeholder'], - 'outputs': ['model/Softmax:0a'], - }) # Invalid index. - self.assertConfigureFails(TensorflowPredict(), {'graphFilename': model, - 'inputs': ['model/Placeholder'], - 'outputs': ['model/Softmax:'], - }) # No index. - self.assertConfigureFails(TensorflowPredict(), {'graphFilename': model, - 'inputs': ['model/Placeholder'], - 'outputs': ['model/Softmax:3'], - }) # Index out of bounds. - self.assertConfigureFails(TensorflowPredict(), {'graphFilename': model, - 'inputs': ['model/Placeholder'], - 'outputs': ['model/Softmax::0'], - }) # Double colon. 
-        self.assertConfigureFails(TensorflowPredict(), {'graphFilename': model,
-                                                        'inputs': ['model/Placeholder'],
-                                                        'outputs': ['model/Softmax:s:0'],
-                                                        })  # Several colons.
-
+        model = join(testdata.models_dir, "vgg", "vgg4.pb")
+
+        configs = [
+            {
+                "graphFilename": model,
+                "inputs": ["model/Placeholder"],
+                "outputs": ["model/Softmax:"],
+            },  # No index.
+            {
+                "graphFilename": model,
+                "inputs": ["model/Placeholder"],
+                "outputs": ["model/Softmax:3"],
+            },  # Index out of bounds.
+            {
+                "graphFilename": model,
+                "inputs": ["model/Placeholder"],
+                "outputs": ["model/Softmax::0"],
+            },  # Double colon.
+            {
+                "graphFilename": model,
+                "inputs": ["model/Placeholder"],
+                "outputs": ["model/Softmax:s:0"],
+            },  # Several colons.
+        ]
+
+        for config in configs:
+            with self.subTest(f"{config} failed"):
+                self.assertConfigureFails(TensorflowPredict(), config)
 
 suite = allTests(TestTensorFlowPredict)
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     TextTestRunner(verbosity=2).run(suite)

From 8dba19e5c2e3c5512bc69f38cbbe0e6361efd8f9 Mon Sep 17 00:00:00 2001
From: palonso 
Date: Thu, 19 Oct 2023 21:53:19 +0200
Subject: [PATCH 12/26] Fix typos and improve test

---
 src/examples/python/models/generate_example_scripts.py | 2 +-
 .../machinelearning/test_tensorflowpredictmaest.py     | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/examples/python/models/generate_example_scripts.py b/src/examples/python/models/generate_example_scripts.py
index ea00e846e..6b380244f 100644
--- a/src/examples/python/models/generate_example_scripts.py
+++ b/src/examples/python/models/generate_example_scripts.py
@@ -100,7 +100,7 @@ def get_additional_parameters(metadata: dict, output: str, algo_name: str):
                 and model_output["name"] != OUTPUT_DEFAULTS[algo_name]
             ):
                 if metadata["name"] == "MAEST" and ":7" in model_output["name"]:
-                    # In maest models we recomend the embeddings from the 7th layer.
+                    # For MAEST we recommend using the embeddings from the 7th layer.
                     additional_parameters += f', output="{model_output["name"]}"'
 
     return additional_parameters
 
diff --git a/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py b/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py
index a50c810ef..079a177f0 100644
--- a/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py
+++ b/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py
@@ -25,9 +25,9 @@ class TestTensorFlowPredictMAEST(TestCase):
     @classmethod
     def setUpClass(self):
         # Since loading the Transformers takes a lot of time, we do it only once as reusable class members.
-        # When using these algos, Essentia complains that other networks (e.g., MonoLoader' network) were
-        # destroyed in the meantime. These warnings are not relevant for the tests, so we disable
-        # Warnings temporally.
+        # When using these algos, Essentia complains that other networks (e.g., MonoLoader's network) were
+        # destroyed in the meantime. These warnings are not relevant for the tests and hinder readability,
+        # so we disable them temporarily.
essentia.log.warningActive = False From 6eaa5a9bfd7fe86b2e19968f3714c531f0f1f6ab Mon Sep 17 00:00:00 2001 From: palonso Date: Thu, 19 Oct 2023 22:26:13 +0200 Subject: [PATCH 13/26] Update models submodule --- test/models | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/models b/test/models index 2884f0da3..3ca4130bc 160000 --- a/test/models +++ b/test/models @@ -1 +1 @@ -Subproject commit 2884f0da3c15e9494e656f8f49e01f5208f1e171 +Subproject commit 3ca4130bcb398a1361867e5d8462d3a7a0c02ccd From 3423d5c1598e9efe4c64bc262ea006dff085a1dd Mon Sep 17 00:00:00 2001 From: palonso Date: Fri, 20 Oct 2023 00:46:06 +0200 Subject: [PATCH 14/26] Only include TensorflowPredictMAEST when TF available --- src/wscript | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wscript b/src/wscript index 651ee1c82..ab86dc469 100644 --- a/src/wscript +++ b/src/wscript @@ -326,12 +326,12 @@ def configure(ctx): else: print('- Essentia is configured without Chromaprint.') print(' The following algorithms will be ignored: %s' % algos) - ctx.env.ALGOIGNORE += algos + ctx.env.ALGOIGNORE += algos algos = [ 'TensorflowPredict', 'TensorflowPredictMusiCNN', 'TensorflowPredictVGGish', 'TensorflowPredictTempoCNN', 'TensorflowPredictCREPE', 'PitchCREPE', 'TempoCNN', 'TensorflowPredictEffnetDiscogs', 'TensorflowPredict2D', - 'TensorflowPredictFSDSINet'] + 'TensorflowPredictFSDSINet', 'TensorflowPredictMAEST',] if has('tensorflow'): print('- Tensorflow detected!') print(' The following algorithms will be included: %s\n' % algos) From b0f0ae461476832f584c2dd950ed8c3ea7c2ec5d Mon Sep 17 00:00:00 2001 From: palonso Date: Fri, 20 Oct 2023 01:03:39 +0200 Subject: [PATCH 15/26] Fix typos --- doc/sphinxdoc/models.rst | 2 +- src/algorithms/machinelearning/tensorflowpredictmaest.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/sphinxdoc/models.rst b/doc/sphinxdoc/models.rst index f881aaec5..7dbf2c9b7 100644 --- a/doc/sphinxdoc/models.rst +++ 
b/doc/sphinxdoc/models.rst @@ -231,7 +231,7 @@ Models: *Note: It is possible to retrieve the output of each attention layer by setting* ``output=StatefulParitionedCall:n`` *, where* ``n`` *is the index of the layer (starting from 1).* *The output from the attention layers should be interpreted as* ``[batch_index, 1, token_number, embeddings_size]`` -*, where the fist and second tokens (i.e.,* ``[0, 0, :2, :]`` *) correspond to the* ``CLS`` *and* ``DIST`` *tokens respectively, and the following ones to input signal (refer to the* `paper `_ *for details).* +*, where the first and second tokens (i.e.,* ``[0, 0, :2, :]`` *) correspond to the* ``CLS`` *and* ``DIST`` *tokens respectively, and the following ones to input signal (refer to the* `paper `_ *for details).* OpenL3 ^^^^^^ diff --git a/src/algorithms/machinelearning/tensorflowpredictmaest.cpp b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp index 09df1d708..52d6a9bec 100644 --- a/src/algorithms/machinelearning/tensorflowpredictmaest.cpp +++ b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp @@ -192,7 +192,7 @@ const char* TensorflowPredictMAEST::description = DOC( "where `n` is the index of the layer (starting from 1).\n" "The output from the attention layers should be interpreted as follows:\n" " [batch_index, 1, token_number, embeddings_size]\n" - "Where the the fist and second tokens (e.g., [0, 0, :2, :]) correspond to the " + "Where the first and second tokens (e.g., [0, 0, :2, :]) correspond to the " "CLS and DIST tokens respectively, and the following ones to input signal ( " "refer to the original paper for details [1]).\n" From ec14b8453c4ab73c09b0d0ee59b5951ad49ade64 Mon Sep 17 00:00:00 2001 From: palonso Date: Fri, 20 Oct 2023 01:09:36 +0200 Subject: [PATCH 16/26] Fix example script generation logic --- src/examples/python/models/generate_example_scripts.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/examples/python/models/generate_example_scripts.py 
b/src/examples/python/models/generate_example_scripts.py index 6b380244f..358ae59c3 100644 --- a/src/examples/python/models/generate_example_scripts.py +++ b/src/examples/python/models/generate_example_scripts.py @@ -99,9 +99,11 @@ def get_additional_parameters(metadata: dict, output: str, algo_name: str): model_output["output_purpose"] == output and model_output["name"] != OUTPUT_DEFAULTS[algo_name] ): - if metadata["name"] == "MAEST" and ":7" in model_output["name"]: + if metadata["name"] == "MAEST" and ":7" not in model_output["name"]: # For MAEST we recommend using the embeddings from the 7th layer. - additional_parameters += f', output="{model_output["name"]}"' + continue + + additional_parameters += f', output="{model_output["name"]}"' return additional_parameters From 7f9653612c20c72d2dff4883fe3bd66616ff84a5 Mon Sep 17 00:00:00 2001 From: palonso Date: Fri, 20 Oct 2023 16:58:15 +0200 Subject: [PATCH 17/26] Use permanent paper link --- doc/sphinxdoc/models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinxdoc/models.rst b/doc/sphinxdoc/models.rst index 7dbf2c9b7..d34bf8ff3 100644 --- a/doc/sphinxdoc/models.rst +++ b/doc/sphinxdoc/models.rst @@ -231,7 +231,7 @@ Models: *Note: It is possible to retrieve the output of each attention layer by setting* ``output=StatefulParitionedCall:n`` *, where* ``n`` *is the index of the layer (starting from 1).* *The output from the attention layers should be interpreted as* ``[batch_index, 1, token_number, embeddings_size]`` -*, where the first and second tokens (i.e.,* ``[0, 0, :2, :]`` *) correspond to the* ``CLS`` *and* ``DIST`` *tokens respectively, and the following ones to input signal (refer to the* `paper `_ *for details).* +*, where the first and second tokens (i.e.,* ``[0, 0, :2, :]`` *) correspond to the* ``CLS`` *and* ``DIST`` *tokens respectively, and the following ones to input signal (refer to the* `paper `_ *for details).* OpenL3 ^^^^^^ From 
4c91afbfa81ccc1a5e4b1daf289fa9df2c359678 Mon Sep 17 00:00:00 2001 From: palonso Date: Fri, 20 Oct 2023 17:14:05 +0200 Subject: [PATCH 18/26] Improve TensorflowPredictMAEST doc --- .../machinelearning/tensorflowpredictmaest.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/algorithms/machinelearning/tensorflowpredictmaest.cpp b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp index 52d6a9bec..147948dad 100644 --- a/src/algorithms/machinelearning/tensorflowpredictmaest.cpp +++ b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp @@ -113,26 +113,26 @@ void TensorflowPredictMAEST::configure() { if (parameter("patchSize").isConfigured()) { if (graphFilename.find("20s") != std::string::npos) { - E_INFO("TensorFlowPredictMAEST: We detected that the default patchSize is not suitable for the graph `" << graphFilename.c_str() << "`. Setting it to 1258, which is adequate for the 20s model."); + E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 1258, which is adequate for the 20s model."); patchSize = 1258; } else if (graphFilename.find("10s") != std::string::npos) { - E_INFO("TensorFlowPredictMAEST: We detected that the default patchSize is not suitable for the graph `" << graphFilename.c_str() << "`. Setting it to 626, which is adequate for the 10s model."); + E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 626, which is adequate for the 10s model."); patchSize = 626; } else if (graphFilename.find("5s") != std::string::npos) { - E_INFO("TensorFlowPredictMAEST: We detected that the default patchSize is not suitable for the graph `" << graphFilename.c_str() << "`. 
Setting it to 316, which is adequate for the 5s model."); + E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 316, which is adequate for the 5s model."); patchSize = 316; } } if (parameter("patchHopSize").isConfigured()) { if (graphFilename.find("20s") != std::string::npos) { - E_INFO("TensorFlowPredictMAEST: Setting patchHopSize to 1250, which is adequate for the 20s model.\n"); + E_INFO("TensorFlowPredictMAEST: The default `patchHopSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 1250, which is adequate for the 20s model.\n"); patchHopSize = 1250; } else if (graphFilename.find("10s") != std::string::npos) { - E_INFO("TensorFlowPredictMAEST: Setting patchHopSize to 625, which is adequate for the 10s model.\n"); + E_INFO("TensorFlowPredictMAEST: The default `patchHopSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 625, which is adequate for the 10s model.\n"); patchHopSize = 625; } else if (graphFilename.find("5s") != std::string::npos) { - E_INFO("TensorFlowPredictMAEST: Setting patchHopSize to 313, which is adequate for the 5s model.\n"); + E_INFO("TensorFlowPredictMAEST: The default `patchHopSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. 
Setting it to 313, which is adequate for the 5s model.\n"); patchHopSize = 313; } } @@ -205,7 +205,7 @@ const char* TensorflowPredictMAEST::description = DOC( "the user's responsibility to make sure it is a valid one.\n" "\n" "Note: when `patchHopSize` and `patchSize` are not specified, the algorithm " - "will parse `graphFilename` to try to set appropriate values.\n" + "will parse the `graphFilename` string to try to set appropriate values.\n" "\n" "References:\n" "\n" From e8b9ab86bb5594cd7eb3cfaae52e8cd3199be058 Mon Sep 17 00:00:00 2001 From: palonso Date: Fri, 20 Oct 2023 17:18:32 +0200 Subject: [PATCH 19/26] Harden string matching pattern --- .../machinelearning/tensorflowpredictmaest.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/algorithms/machinelearning/tensorflowpredictmaest.cpp b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp index 147948dad..29d6a6a78 100644 --- a/src/algorithms/machinelearning/tensorflowpredictmaest.cpp +++ b/src/algorithms/machinelearning/tensorflowpredictmaest.cpp @@ -112,26 +112,26 @@ void TensorflowPredictMAEST::configure() { // https://cs231n.github.io/convolutional-networks/#conv if (parameter("patchSize").isConfigured()) { - if (graphFilename.find("20s") != std::string::npos) { + if (graphFilename.find("discogs-maest-20s-") != std::string::npos) { E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 1258, which is adequate for the 20s model."); patchSize = 1258; - } else if (graphFilename.find("10s") != std::string::npos) { + } else if (graphFilename.find("discogs-maest-10s-") != std::string::npos) { E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. 
Setting it to 626, which is adequate for the 10s model."); patchSize = 626; - } else if (graphFilename.find("5s") != std::string::npos) { + } else if (graphFilename.find("discogs-maest-5s-") != std::string::npos) { E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 316, which is adequate for the 5s model."); patchSize = 316; } } if (parameter("patchHopSize").isConfigured()) { - if (graphFilename.find("20s") != std::string::npos) { + if (graphFilename.find("discogs-maest-20s-") != std::string::npos) { E_INFO("TensorFlowPredictMAEST: The default `patchHopSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 1250, which is adequate for the 20s model.\n"); patchHopSize = 1250; - } else if (graphFilename.find("10s") != std::string::npos) { + } else if (graphFilename.find("discogs-maest-10s-") != std::string::npos) { E_INFO("TensorFlowPredictMAEST: The default `patchHopSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 625, which is adequate for the 10s model.\n"); patchHopSize = 625; - } else if (graphFilename.find("5s") != std::string::npos) { + } else if (graphFilename.find("discogs-maest-5s-") != std::string::npos) { E_INFO("TensorFlowPredictMAEST: The default `patchHopSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. 
Setting it to 313, which is adequate for the 5s model.\n"); patchHopSize = 313; } From 53148b2ec27da92a20c53ce4286e274e1ef192ea Mon Sep 17 00:00:00 2001 From: palonso Date: Fri, 20 Oct 2023 17:38:59 +0200 Subject: [PATCH 20/26] Improve MAEST doc --- doc/sphinxdoc/models.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/sphinxdoc/models.rst b/doc/sphinxdoc/models.rst index d34bf8ff3..1b46f7154 100644 --- a/doc/sphinxdoc/models.rst +++ b/doc/sphinxdoc/models.rst @@ -141,6 +141,9 @@ Discogs-MAEST ^^^^^^^^^^^^^ Music Audio Efficient Spectrogram Transformer (`MAEST `_) trained to predict music style labels using an in-house dataset annotated with Discogs metadata. +We offer versions of MAEST trained with sequence lengths ranging from 5 to 30 seconds (``5s``, ``10s``, ``20s``, and ``30s``), and trained starting from different intial weights: from random initialization (``fs``), from `DeiT `_ pre-trained weights (``dw``), and from `PaSST `_ pre-trained weights (``pw``). Additionally, we offer a version of MAEST trained following a teacher student setup (``ts``). +According to our study ``discogs-maest-30s-pw``, achieved the most competitive performance in most downstream tasks (refer to the `paper `_ for details). 
+
 
 
 Models:
 
@@ -231,7 +234,7 @@ Models:
 *Note: It is possible to retrieve the output of each attention layer by setting* ``output=StatefulParitionedCall:n`` *, where* ``n`` *is the index of the layer (starting from 1).*
 
 *The output from the attention layers should be interpreted as* ``[batch_index, 1, token_number, embeddings_size]``
-*, where the first and second tokens (i.e.,* ``[0, 0, :2, :]`` *) correspond to the* ``CLS`` *and* ``DIST`` *tokens respectively, and the following ones to input signal (refer to the* `paper `_ *for details).*
+*, where the first and second tokens (i.e.,* ``[0, 0, :2, :]`` *) correspond to the* ``CLS`` *and* ``DIST`` *tokens respectively, and the following ones to input signal.*
 
 OpenL3
 ^^^^^^

From 6b043f127b7da5f41adf119df442d01fcef37d34 Mon Sep 17 00:00:00 2001
From: palonso 
Date: Fri, 20 Oct 2023 19:34:47 +0200
Subject: [PATCH 21/26] Use a stronger threshold in tensorflowpredictmaest

This was achieved by making sure that the input melspectrogram was
computed exactly in the same way in TensorflowPredictMAEST and the
original pytorch implementation, using MonoLoader with resampleQuality=1.

While using resampleQuality=4 has a numerical impact on the results, it
is not enough to change the prediction ranking according to informal
observations (this might not hold in corner cases).
--- .../preds_maest_discogs-maest-30s-pw-1.npy | Bin 0 -> 1728 bytes .../test_tensorflowpredictmaest.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 test/src/unittests/machinelearning/tensorflowpredictmaest/preds_maest_discogs-maest-30s-pw-1.npy diff --git a/test/src/unittests/machinelearning/tensorflowpredictmaest/preds_maest_discogs-maest-30s-pw-1.npy b/test/src/unittests/machinelearning/tensorflowpredictmaest/preds_maest_discogs-maest-30s-pw-1.npy new file mode 100644 index 0000000000000000000000000000000000000000..2a8d294e831f023393bdc98487c6bef486227794 GIT binary patch literal 1728 zcmbV}`#;qA9)Kr=A}5ie!C;%&bqme+^L~H6b0KY%LSE8k_jHk=L@wj9uHkiZ$t^-z ztYUVS)|Tw6bkk{9D{ZzTDNVVhBc>=LW!I^8Pv>7a&u`E3>r>_zQ@s`|Ak0A1a9?~(B2x;4YBc}#=c)D%?oTBPU<5)K7 z;41JyLj-Z&GDRf+yi2Z`Ud9{EmM9EAigukI)X@JCRJ~h{Hoa-+@Gh)M1?!EsmAtS;?Lb49`aLD*?p!3xHeu#>e1IA8F+8Y4rT)}G&=_!>$swU z`fO2Ex-FYCutajdk?$T9ev9>rxC!l=P@4N;Rx}x%$_7fW!))X&s+(L6Vy+W-TwID< z!wSTA&Twg?(3bU(8Ka`OfgQ?kXO*^rqKq^halxH7T-rN@%I~Vd{=?^Jy2u3ktnJyX z9w*k$CZ6`~`jVC>xzpjsBY1VPk(xzz;%$yvqHH%8jW$itLlcCRmpCI?u7(z_3;Jz| zqUHN@VdhLG>Xw+%<+O~Qf?1Yv8H4OXj{8`0p0uK)m0ja~oQ^vz!0usd`juJ(zsb8G z>5_y!{MTZ3&+QtFT*bk`J5i#$#F|w-{GJ{TuBCg9Tt)Z80c=~vN*d9Y1VdAK^gu#B z`ljBLUaQ#3-rF@Log0*}`~2fYS%EI11H07J;f*N@vM#U{lS*8DaxXp=#)6{I8m`C* z>XyX`bEfsM<+=(U}Q%d%sB+Wv($~u z1K$a7>+2!PL=!qx6F_VFrm>}e5U@W8yGwN-^hOiJKCuF+t37q~%z^C(W{Gs=I!u05 z4DT+R&@#QJkR_UryzfrK?Xd-TXlejR>N31mZwW?UUckx35d8N+f0Wo*z%ROUc;fLc z-1oQwZq#q3$q_k7-p|94nzvx=;R=h4HNdM(z_o8*fIqhsC270ip!Rd9{PPC*#@eH4 z%TYY;b{3p|>BjkY^)Rs3jIpftBGKC}F|gVb8|;sPsw)b#>_;GJlLgt9(McS9w}8yx z9Bhj!Tnxdh8kvAAUz4IP6*Jl%$!Hp4 z;pKQDNxM@=EOKDs;!7NQRy{-TU~fi{@QIS?VshB+-6{pR*7qPMOP@0oMhyTz^0d;+ zVI*q}eeJG>6{E?}Y4!}(<7?*Wj!n?*2{2tEpw}yXNE*k2IB)*}XdMNvdlEW+#f58z zD$v!+M%N$()o#|q`CY{bFA8C2Js%D3ORzV29IA4UqGE`HE@igZalQxos>g}1tq*cz zt|D)A78Tn9s6u%HSNf#G3gt?4+_V7{Q|Dn{Q#p9pIK%r49V~ok2e*2BAjV1y_kVcI z@D&^yRQrgy?fD*Rs$Y;nU#c&me2@`% zLCnk=hUwOI7~&oUKTWW>h93lmCx1pi{US1Gk^#NWFPWJW6<{POrXtHtY>MDH{uk%& 
Bxc&eD literal 0 HcmV?d00001 diff --git a/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py b/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py index 079a177f0..572d17a99 100644 --- a/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py +++ b/test/src/unittests/machinelearning/test_tensorflowpredictmaest.py @@ -55,12 +55,12 @@ def testRegression(self): ) filename = join(testdata.audio_dir, "recorded", "techno_loop.wav") - audio = MonoLoader(filename=filename, sampleRate=16000, resampleQuality=4)() + audio = MonoLoader(filename=filename, sampleRate=16000)() activations = self.model30s(audio) found = numpy.mean(activations, axis=0).squeeze() - self.assertAlmostEqualVector(found, expected, 1e-1) + self.assertAlmostEqualVector(found, expected, 1e-3) def testInvalidParam(self): self.assertConfigureFails( From 92f0831a855fd2cff4f92e617988657b2ba1f5a5 Mon Sep 17 00:00:00 2001 From: palonso Date: Fri, 20 Oct 2023 19:40:00 +0200 Subject: [PATCH 22/26] Use MAEST as title in the documentation. This makes a cleaner url for the model and is consistent with other models without reference to the training dataset (e.g., OpenL3, CREPE). --- doc/sphinxdoc/models.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/sphinxdoc/models.rst b/doc/sphinxdoc/models.rst index 1b46f7154..4d4780477 100644 --- a/doc/sphinxdoc/models.rst +++ b/doc/sphinxdoc/models.rst @@ -137,8 +137,8 @@ Models: *Note: We provide models operating with a fixed batch size of 64 samples since it was not possible to port the version with dynamic batch size from ONNX to TensorFlow. Additionally, an ONNX version of the model with* `dynamic batch `_ *size is provided.* -Discogs-MAEST -^^^^^^^^^^^^^ +MAEST +^^^^^ Music Audio Efficient Spectrogram Transformer (`MAEST `_) trained to predict music style labels using an in-house dataset annotated with Discogs metadata. 
We offer versions of MAEST trained with sequence lengths ranging from 5 to 30 seconds (``5s``, ``10s``, ``20s``, and ``30s``), and trained starting from different intial weights: from random initialization (``fs``), from `DeiT `_ pre-trained weights (``dw``), and from `PaSST `_ pre-trained weights (``pw``). Additionally, we offer a version of MAEST trained following a teacher student setup (``ts``). From 10227fa2f670386d4ddc27328b1701bcd9915508 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Tue, 24 Oct 2023 16:45:10 +0200 Subject: [PATCH 23/26] Add gifsync to applications --- doc/sphinxdoc/_templates/applications.html | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/sphinxdoc/_templates/applications.html b/doc/sphinxdoc/_templates/applications.html index ea31b493a..955322dfc 100644 --- a/doc/sphinxdoc/_templates/applications.html +++ b/doc/sphinxdoc/_templates/applications.html @@ -296,7 +296,18 @@

Applications

ofxAudioAnalyzer is an openFrameworks wrapper for Essentia. It provides audio analysis algorithms modified to process signals in real-time. - +
+ +
+ GIF Sync reassembles the frames of a GIF to sync its animation to the beat of an audio file. +
+
{% endblock %} From 86202df602353fc22d25e047a9f13e9a07af1ea4 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Wed, 29 Nov 2023 18:45:11 +0100 Subject: [PATCH 24/26] Docs: add MAEST Replicate demo --- doc/sphinxdoc/demos.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/sphinxdoc/demos.rst b/doc/sphinxdoc/demos.rst index 0880e7cbe..8cfb86555 100644 --- a/doc/sphinxdoc/demos.rst +++ b/doc/sphinxdoc/demos.rst @@ -15,8 +15,9 @@ Essentia TensorFlow models Examples of inference with the pre-trained TensorFlow models for music auto-tagging and classification tasks: - Music classification by genre, mood, danceability, instrumentation: https://replicate.com/mtg/music-classifiers -- Music style classification with the Discogs taxonomy (400 styles). Overall track-level predictions: https://replicate.com/mtg/effnet-discogs -- Music style classification with the Discogs taxonomy (400 styles). Segment-level real-time predictions with Essentia.js: https://essentia.upf.edu/essentiajs-discogs +- Music style classification with the Discogs taxonomy (400 styles, MAEST model). Overall track-level predictions: https://replicate.com/mtg/maest +- Music style classification with the Discogs taxonomy (400 styles, Effnet-Discogs model). Overall track-level predictions: https://replicate.com/mtg/effnet-discogs +- Music style classification with the Discogs taxonomy (400 styles, Effnet-Discogs model). 
Segment-level real-time predictions with Essentia.js: https://essentia.upf.edu/essentiajs-discogs - Real-time music autotagging (50 tags) in the browser with Essentia.js: https://mtg.github.io/essentia.js/examples/demos/autotagging-rt/ - Mood classification in the browser with Essentia.js: https://mtg.github.io/essentia.js/examples/demos/mood-classifiers/ - Music emotion arousal/valence regression: https://replicate.com/mtg/music-arousal-valence From e1a476f6b3c7cdcc8416e8af3e0b2a2a5756b3df Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Thu, 30 Nov 2023 13:07:14 +0100 Subject: [PATCH 25/26] Docs: add tempo estimation demos --- doc/sphinxdoc/demos.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/sphinxdoc/demos.rst b/doc/sphinxdoc/demos.rst index 8cfb86555..fe15dab2b 100644 --- a/doc/sphinxdoc/demos.rst +++ b/doc/sphinxdoc/demos.rst @@ -9,6 +9,12 @@ Examples of music audio analysis with Essentia algorithms using Essentia.js https://mtg.github.io/essentia.js/examples/ +Tempo estimation +---------------- + +Tempo BPM estimation with Essentia: https://replicate.com/mtg/essentia-bpm + + Essentia TensorFlow models -------------------------- From 95c996e312ad6de0530bc43772ee1677d9e738c7 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Fri, 1 Dec 2023 13:25:37 +0100 Subject: [PATCH 26/26] Fix typo --- src/algorithms/filters/iir.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/algorithms/filters/iir.cpp b/src/algorithms/filters/iir.cpp index e7c0896a7..dd58d9fe9 100644 --- a/src/algorithms/filters/iir.cpp +++ b/src/algorithms/filters/iir.cpp @@ -26,7 +26,7 @@ using namespace standard; const char* IIR::name = "IIR"; const char* IIR::category = "Filters"; -const char* IIR::description = DOC("This algorithm implements a standard IIR filter. It filters the data in the input vector with the filter described by parameter vectors 'numerator' and 'denominator' to create the output filtered vector. 
In the litterature, the numerator is often referred to as the 'B' coefficients and the denominator as the 'A' coefficients.\n" +const char* IIR::description = DOC("This algorithm implements a standard IIR filter. It filters the data in the input vector with the filter described by parameter vectors 'numerator' and 'denominator' to create the output filtered vector. In the literature, the numerator is often referred to as the 'B' coefficients and the denominator as the 'A' coefficients.\n" "\n" "The filter is a Direct Form II Transposed implementation of the standard difference equation:\n" " a(0)*y(n) = b(0)*x(n) + b(1)*x(n-1) + ... + b(nb-1)*x(n-nb+1) - a(1)*y(n-1) - ... - a(nb-1)*y(n-na+1)\n"